Outline

1 - Preprocessing

Load All Packages at Glance

require(arules)
require(arulesViz)
require(data.table)

require(dplyr)
require(ggplot2)
require(data.table)
#require(xlsx)
require(readxl)
require(ggmap)
#require(proj4)
require("performanceEstimation")
require("e1071")
require("DMwR") 

Load Data

data_dir <- "../data"
df <- read.csv(paste(paste( data_dir, "Accidents_2015.csv", sep= "/" )), stringsAsFactors = FALSE)

# Show some info
(colnames(df))
##  [1] "Accident_Index"                             
##  [2] "Location_Easting_OSGR"                      
##  [3] "Location_Northing_OSGR"                     
##  [4] "Longitude"                                  
##  [5] "Latitude"                                   
##  [6] "Police_Force"                               
##  [7] "Accident_Severity"                          
##  [8] "Number_of_Vehicles"                         
##  [9] "Number_of_Casualties"                       
## [10] "Date"                                       
## [11] "Day_of_Week"                                
## [12] "Time"                                       
## [13] "Local_Authority_.District."                 
## [14] "Local_Authority_.Highway."                  
## [15] "X1st_Road_Class"                            
## [16] "X1st_Road_Number"                           
## [17] "Road_Type"                                  
## [18] "Speed_limit"                                
## [19] "Junction_Detail"                            
## [20] "Junction_Control"                           
## [21] "X2nd_Road_Class"                            
## [22] "X2nd_Road_Number"                           
## [23] "Pedestrian_Crossing.Human_Control"          
## [24] "Pedestrian_Crossing.Physical_Facilities"    
## [25] "Light_Conditions"                           
## [26] "Weather_Conditions"                         
## [27] "Road_Surface_Conditions"                    
## [28] "Special_Conditions_at_Site"                 
## [29] "Carriageway_Hazards"                        
## [30] "Urban_or_Rural_Area"                        
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"

Show Some Details

(str(df))
## 'data.frame':    140056 obs. of  32 variables:
##  $ Accident_Index                             : chr  "201501BS70001" "201501BS70002" "201501BS70004" "201501BS70005" ...
##  $ Location_Easting_OSGR                      : int  525130 526530 524610 524420 524630 525480 526890 527590 524170 525010 ...
##  $ Location_Northing_OSGR                     : int  180050 178560 181080 181080 179040 179530 178940 178660 180930 181200 ...
##  $ Longitude                                  : num  -0.198 -0.179 -0.206 -0.208 -0.206 ...
##  $ Latitude                                   : num  51.5 51.5 51.5 51.5 51.5 ...
##  $ Police_Force                               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Accident_Severity                          : int  3 3 3 3 2 3 3 3 3 3 ...
##  $ Number_of_Vehicles                         : int  1 1 1 1 2 2 2 2 2 2 ...
##  $ Number_of_Casualties                       : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Date                                       : chr  "12/01/2015" "12/01/2015" "12/01/2015" "13/01/2015" ...
##  $ Day_of_Week                                : int  2 2 2 3 6 5 5 1 6 3 ...
##  $ Time                                       : chr  "18:45" "07:50" "18:08" "07:40" ...
##  $ Local_Authority_.District.                 : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ Local_Authority_.Highway.                  : chr  "E09000020" "E09000020" "E09000020" "E09000020" ...
##  $ X1st_Road_Class                            : int  5 6 4 4 3 3 3 6 5 6 ...
##  $ X1st_Road_Number                           : int  0 0 415 450 315 315 3218 0 0 0 ...
##  $ Road_Type                                  : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ Speed_limit                                : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ Junction_Detail                            : int  3 3 2 6 6 3 6 0 3 3 ...
##  $ Junction_Control                           : int  4 4 4 4 2 4 2 -1 4 4 ...
##  $ X2nd_Road_Class                            : int  6 3 6 6 3 5 3 -1 6 6 ...
##  $ X2nd_Road_Number                           : int  0 3218 0 0 3220 0 3218 0 0 0 ...
##  $ Pedestrian_Crossing.Human_Control          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pedestrian_Crossing.Physical_Facilities    : int  0 0 1 0 5 4 5 0 1 0 ...
##  $ Light_Conditions                           : int  4 1 4 1 1 1 1 1 1 1 ...
##  $ Weather_Conditions                         : int  1 1 2 1 2 1 8 1 1 8 ...
##  $ Road_Surface_Conditions                    : int  1 1 2 2 2 2 2 1 1 1 ...
##  $ Special_Conditions_at_Site                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Carriageway_Hazards                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Urban_or_Rural_Area                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Did_Police_Officer_Attend_Scene_of_Accident: int  1 1 1 2 2 1 1 1 1 1 ...
##  $ LSOA_of_Accident_Location                  : chr  "E01002825" "E01002820" "E01002833" "E01002874" ...
## NULL
(ncol(df))
## [1] 32
(nrow(df))
## [1] 140056

Some Preprocessing (Avoid Messy Columns Names)

## Change messy column names

setnames( df  , "Local_Authority_.District." , "Local_Authority_District" )
setnames( df  , "Local_Authority_.Highway."  , "Local_Authority_Highway"  )

setnames( df  , "X1st_Road_Class"            , "First_Road_Class"         )
setnames( df  , "X1st_Road_Number"           , "First_Road_Number"        )

setnames( df  , "X2nd_Road_Class"            , "Second_Road_Class"        )
setnames( df  , "X2nd_Road_Number"           , "Second_Road_Number"       )

setnames( df  , "Pedestrian_Crossing.Human_Control"  , "Pedestrian_Crossing-Human_Control"  )
setnames( df  , "Pedestrian_Crossing.Physical_Facilities"  , "Pedestrian_Crossing-Physical_Facilities"  )

(colnames(df))
##  [1] "Accident_Index"                             
##  [2] "Location_Easting_OSGR"                      
##  [3] "Location_Northing_OSGR"                     
##  [4] "Longitude"                                  
##  [5] "Latitude"                                   
##  [6] "Police_Force"                               
##  [7] "Accident_Severity"                          
##  [8] "Number_of_Vehicles"                         
##  [9] "Number_of_Casualties"                       
## [10] "Date"                                       
## [11] "Day_of_Week"                                
## [12] "Time"                                       
## [13] "Local_Authority_District"                   
## [14] "Local_Authority_Highway"                    
## [15] "First_Road_Class"                           
## [16] "First_Road_Number"                          
## [17] "Road_Type"                                  
## [18] "Speed_limit"                                
## [19] "Junction_Detail"                            
## [20] "Junction_Control"                           
## [21] "Second_Road_Class"                          
## [22] "Second_Road_Number"                         
## [23] "Pedestrian_Crossing-Human_Control"          
## [24] "Pedestrian_Crossing-Physical_Facilities"    
## [25] "Light_Conditions"                           
## [26] "Weather_Conditions"                         
## [27] "Road_Surface_Conditions"                    
## [28] "Special_Conditions_at_Site"                 
## [29] "Carriageway_Hazards"                        
## [30] "Urban_or_Rural_Area"                        
## [31] "Did_Police_Officer_Attend_Scene_of_Accident"
## [32] "LSOA_of_Accident_Location"

´

Access Number of Incomplete Cases

(apply(is.na(df), 2, sum))

Access Number of Levels

(apply( df, 2, function(x) length( unique(x) )))

# Quick view on data
(head(df))

Encode Some Columns Code Numbers Into Factors (lookup Table)

# Some dummy copy 
df_copy <- df 

# Lets map things according lookup table from spreadsheet
map <- data.table( number_sheet  = 3:21,
                   name_variable  = c( 'Police_Force',
                                       'Accident_Severity', 
                                       'Day_of_Week',
                                       'Local_Authority_District', 
                                       'Local_Authority_Highway',
                                       'First_Road_Class', 
                                       'Road_Type',
                                       'Junction_Detail',
                                       'Junction_Control',
                                       'Second_Road_Class', 
                                       'Pedestrian_Crossing-Human_Control',
                                       'Pedestrian_Crossing-Physical_Facilities',
                                       'Light_Conditions',
                                       'Weather_Conditions', 
                                       'Road_Surface_Conditions',
                                       'Special_Conditions_at_Site',
                                       'Carriageway_Hazards',
                                       'Urban_or_Rural_Area', 
                                       'Did_Police_Officer_Attend_Scene_of_Accident')
                   )

## Iterate

for( Index in map$number_sheet ){
  
  look<- data.table(read_excel(paste(data_dir, "Road-Accident-Safety-Data-Guide.xls" , sep= "/" ),
                              sheet = Index))
  
  names(look)[1]            <- tolower( names(look)[1])
  name_variable             <- map[ number_sheet == Index , name_variable ]
  df_copy [[name_variable]] <- look[ match( df[[name_variable]], look[['code']]) ]$label
}


# No need for df and look, lets remove from memory to save space
df <- df_copy

rm(look)
rm(df_copy)

# Quick view on data
(head(df))
##   Accident_Index Location_Easting_OSGR Location_Northing_OSGR Longitude
## 1  201501BS70001                525130                 180050 -0.198465
## 2  201501BS70002                526530                 178560 -0.178838
## 3  201501BS70004                524610                 181080 -0.205590
## 4  201501BS70005                524420                 181080 -0.208327
## 5  201501BS70008                524630                 179040 -0.206022
## 6  201501BS70009                525480                 179530 -0.193610
##   Latitude        Police_Force Accident_Severity Number_of_Vehicles
## 1 51.50554 Metropolitan Police            Slight                  1
## 2 51.49184 Metropolitan Police            Slight                  1
## 3 51.51491 Metropolitan Police            Slight                  1
## 4 51.51495 Metropolitan Police            Slight                  1
## 5 51.49657 Metropolitan Police           Serious                  2
## 6 51.50079 Metropolitan Police            Slight                  2
##   Number_of_Casualties       Date Day_of_Week  Time Local_Authority_District
## 1                    1 12/01/2015      Monday 18:45   Kensington and Chelsea
## 2                    1 12/01/2015      Monday 07:50   Kensington and Chelsea
## 3                    1 12/01/2015      Monday 18:08   Kensington and Chelsea
## 4                    1 13/01/2015     Tuesday 07:40   Kensington and Chelsea
## 5                    1 09/01/2015      Friday 07:30   Kensington and Chelsea
## 6                    1 15/01/2015    Thursday 09:56   Kensington and Chelsea
##   First_Road_Class First_Road_Number          Road_Type Speed_limit
## 1                C                 0 Single carriageway          30
## 2     Unclassified                 0 Single carriageway          30
## 3                B               415 Single carriageway          30
## 4                B               450 Single carriageway          30
## 5                A               315 Single carriageway          30
## 6                A               315 Single carriageway          30
##           Junction_Detail         Junction_Control Second_Road_Class
## 1 T or staggered junction Give way or uncontrolled      Unclassified
## 2 T or staggered junction Give way or uncontrolled                 A
## 3         Mini-roundabout Give way or uncontrolled      Unclassified
## 4              Crossroads Give way or uncontrolled      Unclassified
## 5              Crossroads      Auto traffic signal                 A
## 6 T or staggered junction Give way or uncontrolled                 C
##   Second_Road_Number Pedestrian_Crossing-Human_Control
## 1                  0             None within 50 metres
## 2               3218             None within 50 metres
## 3                  0             None within 50 metres
## 4                  0             None within 50 metres
## 5               3220             None within 50 metres
## 6                  0             None within 50 metres
##                                     Pedestrian_Crossing-Physical_Facilities
## 1                          No physical crossing facilities within 50 metres
## 2                          No physical crossing facilities within 50 metres
## 3                                                                     Zebra
## 4                          No physical crossing facilities within 50 metres
## 5                               Pedestrian phase at traffic signal junction
## 6 Pelican, puffin, toucan or similar non-junction pedestrian light crossing
##        Light_Conditions    Weather_Conditions Road_Surface_Conditions
## 1 Darkness - lights lit    Fine no high winds                     Dry
## 2              Daylight    Fine no high winds                     Dry
## 3 Darkness - lights lit Raining no high winds             Wet or damp
## 4              Daylight    Fine no high winds             Wet or damp
## 5              Daylight Raining no high winds             Wet or damp
## 6              Daylight    Fine no high winds             Wet or damp
##   Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area
## 1                       None                None               Urban
## 2                       None                None               Urban
## 3                       None                None               Urban
## 4                       None                None               Urban
## 5                       None                None               Urban
## 6                       None                None               Urban
##   Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
## 1                                         Yes                 E01002825
## 2                                         Yes                 E01002820
## 3                                         Yes                 E01002833
## 4                                          No                 E01002874
## 5                                          No                 E01002814
## 6                                         Yes                 E01002816

Some Features Have Many Factors, Other Dont Contain Any Relevant Info.

  • (Indentify those were none/unknow/unclassified, etc are more frequent and take some action if needed.)
for ( Index in map$number_sheet ){
  name_variable   <- map[ number_sheet == Index , name_variable ]
  cat ( paste0( "\n\n",name_variable, ":" ))
  print ( table( df[[name_variable]] ))
}
## 
## 
## Police_Force:
##     Avon and Somerset          Bedfordshire        Cambridgeshire 
##                  3000                  1478                  1968 
##               Central              Cheshire        City of London 
##                   508                  2322                   346 
##             Cleveland               Cumbria            Derbyshire 
##                   941                  1293                  2147 
##    Devon and Cornwall                Dorset Dumfries and Galloway 
##                  3856                  1777                   276 
##                Durham           Dyfed-Powys                 Essex 
##                  1073                  1325                  3746 
##                  Fife       Gloucestershire              Grampian 
##                   428                   815                   656 
##    Greater Manchester                 Gwent             Hampshire 
##                  3073                   689                  4365 
##         Hertfordshire            Humberside                  Kent 
##                  2510                  2394                  4878 
##            Lancashire        Leicestershire          Lincolnshire 
##                  3711                  2249                  2134 
##   Lothian and Borders            Merseyside   Metropolitan Police 
##                  2083                  2706                 24886 
##               Norfolk           North Wales       North Yorkshire 
##                  1749                  1333                  2062 
##      Northamptonshire              Northern           Northumbria 
##                  1323                   448                  3229 
##       Nottinghamshire           South Wales       South Yorkshire 
##                  2750                  2196                  3066 
##         Staffordshire           Strathclyde               Suffolk 
##                  2599                  3590                  1486 
##                Surrey                Sussex               Tayside 
##                  3858                  4302                   475 
##         Thames Valley          Warwickshire           West Mercia 
##                  5190                  1504                  2297 
##         West Midlands        West Yorkshire             Wiltshire 
##                  6096                  5286                  1584 
## 
## 
## Accident_Severity:
##   Fatal Serious  Slight 
##    1616   20038  118402 
## 
## 
## Day_of_Week:
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     22374     20032     18114     15258     21479     21431     21368 
## 
## 
## Local_Authority_District:
##                Aberdeen City                Aberdeenshire 
##                          227                          347 
##                         Adur                    Allerdale 
##                          156                          247 
##                 Amber Valley                        Angus 
##                          213                          144 
##              Argyll and Bute                         Arun 
##                          226                          313 
##                     Ashfield                      Ashford 
##                          269                          341 
##               Aylesbury Vale                      Babergh 
##                          361                          215 
##         Barking and Dagenham                       Barnet 
##                          482                         1062 
##                     Barnsley            Barrow-in-Furness 
##                          499                          144 
##                     Basildon        Basingstoke and Deane 
##                          389                          340 
##                    Bassetlaw Bath and North East Somerset 
##                          284                          303 
##                      Bedford                       Bexley 
##                          399                          437 
##                   Birmingham                        Blaby 
##                         2970                          218 
##        Blackburn with Darwen                    Blackpool 
##                          398                          375 
##                Blaenau Gwent                     Bolsover 
##                           64                          177 
##                       Bolton                       Boston 
##                          373                          205 
##                  Bournemouth             Bracknell Forest 
##                          489                          212 
##                     Bradford                    Braintree 
##                         1235                          302 
##                    Breckland                        Brent 
##                          278                          907 
##                    Brentwood                     Bridgend 
##                          187                          232 
##            Brighton and Hove             Bristol, City of 
##                          780                          942 
##                    Broadland                      Bromley 
##                          230                          792 
##                   Bromsgrove                   Broxbourne 
##                          181                          200 
##                     Broxtowe                      Burnley 
##                          223                          249 
##                         Bury                   Caerphilly 
##                          179                          189 
##                   Calderdale                    Cambridge 
##                          411                          346 
##                       Camden                Cannock Chase 
##                          936                          207 
##                   Canterbury                      Cardiff 
##                          371                          622 
##                     Carlisle              Carmarthenshire 
##                          256                          436 
##                 Castle Point         Central Bedfordshire 
##                          171                          609 
##                   Ceredigion                    Charnwood 
##                          177                          303 
##                   Chelmsford                   Cheltenham 
##                          360                          131 
##                     Cherwell                Cheshire East 
##                          408                          837 
##    Cheshire West and Chester                 Chesterfield 
##                          756                          194 
##                   Chichester                     Chiltern 
##                          347                          120 
##                      Chorley                 Christchurch 
##                          241                          113 
##               City of London             Clackmannanshire 
##                          344                           62 
##                   Colchester                        Conwy 
##                          385                          250 
##                     Copeland                        Corby 
##                          166                           90 
##                     Cornwall                     Cotswold 
##                         1229                          142 
##                County Durham                     Coventry 
##                          866                          688 
##                       Craven                      Crawley 
##                          161                          306 
##                      Croydon                      Dacorum 
##                          896                          233 
##                   Darlington                     Dartford 
##                          207                          415 
##                     Daventry                 Denbighshire 
##                          188                          227 
##                        Derby             Derbyshire Dales 
##                          583                          185 
##                    Doncaster                        Dover 
##                          855                          248 
##                       Dudley        Dumfries and Galloway 
##                          432                          276 
##                  Dundee City                       Ealing 
##                          131                         1014 
##                East Ayrshire          East Cambridgeshire 
##                          204                          145 
##                   East Devon                  East Dorset 
##                          301                          189 
##          East Dunbartonshire               East Hampshire 
##                           95                          234 
##           East Hertfordshire                 East Lindsey 
##                          235                          447 
##                 East Lothian        East Northamptonshire 
##                          158                          119 
##            East Renfrewshire     East Riding of Yorkshire 
##                           94                          762 
##           East Staffordshire                   Eastbourne 
##                          344                          245 
##                    Eastleigh                         Eden 
##                          274                          203 
##           Edinburgh, City of                    Elmbridge 
##                         1111                          412 
##                      Enfield                Epping Forest 
##                          789                          384 
##              Epsom and Ewell                      Erewash 
##                          182                          216 
##                       Exeter                      Falkirk 
##                          262                          249 
##                      Fareham                      Fenland 
##                          250                          189 
##                         Fife                   Flintshire 
##                          428                          249 
##                 Forest Heath               Forest of Dean 
##                          128                          100 
##                        Fylde                    Gateshead 
##                          215                          517 
##                      Gedling                 Glasgow City 
##                          191                         1198 
##                   Gloucester                      Gosport 
##                          173                          138 
##                    Gravesham               Great Yarmouth 
##                          269                          162 
##                    Greenwich                    Guildford 
##                          646                          563 
##                      Gwynedd                      Hackney 
##                          246                          857 
##                       Halton                    Hambleton 
##                          224                          253 
##       Hammersmith and Fulham                   Harborough 
##                          614                          182 
##                     Haringey                       Harlow 
##                          881                          127 
##                    Harrogate                       Harrow 
##                          457                          424 
##                         Hart                   Hartlepool 
##                          211                          136 
##                     Hastings                       Havant 
##                          237                          223 
##                     Havering     Herefordshire, County of 
##                          632                          400 
##                    Hertsmere                    High Peak 
##                          364                          182 
##                     Highland                   Hillingdon 
##                          379                          724 
##        Hinckley and Bosworth                      Horsham 
##                          200                          309 
##                     Hounslow              Huntingdonshire 
##                          842                          378 
##                     Hyndburn                   Inverclyde 
##                          208                          109 
##                      Ipswich             Isle of Anglesey 
##                          260                          100 
##                Isle of Wight                    Islington 
##                          315                          844 
##       Kensington and Chelsea                    Kettering 
##                          632                          148 
## King's Lynn and West Norfolk  Kingston upon Hull, City of 
##                          272                          748 
##         Kingston upon Thames                     Kirklees 
##                          332                          944 
##                     Knowsley                      Lambeth 
##                          271                         1226 
##                    Lancaster                        Leeds 
##                          388                         1979 
##                    Leicester                        Lewes 
##                          845                          253 
##                     Lewisham                    Lichfield 
##                          884                          227 
##                      Lincoln                    Liverpool 
##                          303                         1137 
##    London Airport (Heathrow)                        Luton 
##                           39                          470 
##                    Maidstone                       Maldon 
##                          443                          101 
##                Malvern Hills                   Manchester 
##                          171                          723 
##                    Mansfield                       Medway 
##                          252                          656 
##                       Melton                       Mendip 
##                          113                          188 
##               Merthyr Tydfil                       Merton 
##                           99                          497 
##                    Mid Devon                  Mid Suffolk 
##                          138                          203 
##                   Mid Sussex                Middlesbrough 
##                          363                          304 
##                   Midlothian                Milton Keynes 
##                          190                          641 
##                  Mole Valley                Monmouthshire 
##                          291                          126 
##                        Moray            Neath Port Talbot 
##                           82                          216 
##                   New Forest          Newark and Sherwood 
##                          425                          321 
##          Newcastle upon Tyne         Newcastle-under-Lyme 
##                          765                          275 
##                       Newham                      Newport 
##                          881                          232 
##               North Ayrshire                  North Devon 
##                          190                          217 
##                 North Dorset        North East Derbyshire 
##                          130                          204 
##      North East Lincolnshire          North Hertfordshire 
##                          435                          290 
##               North Kesteven            North Lanarkshire 
##                          287                          447 
##           North Lincolnshire                North Norfolk 
##                          449                          163 
##               North Somerset               North Tyneside 
##                          339                          447 
##           North Warwickshire    North West Leicestershire 
##                          315                          219 
##                  Northampton               Northumberland 
##                          432                          716 
##                      Norwich                   Nottingham 
##                          374                          956 
##        Nuneaton and Bedworth            Oadby and Wigston 
##                          226                           76 
##                       Oldham               Orkney Islands 
##                          289                           12 
##                       Oxford                Pembrokeshire 
##                          409                          284 
##                       Pendle            Perth and Kinross 
##                          172                          200 
##                 Peterborough                     Plymouth 
##                          558                          619 
##                        Poole                   Portsmouth 
##                          314                          591 
##                        Powys                      Preston 
##                          428                          387 
##                      Purbeck                      Reading 
##                          142                          348 
##                    Redbridge         Redcar and Cleveland 
##                          718                          190 
##                     Redditch         Reigate and Banstead 
##                          112                          478 
##                 Renfrewshire         Rhondda, Cynon, Taff 
##                          258                          395 
##                Ribble Valley         Richmond upon Thames 
##                          118                          401 
##                Richmondshire                     Rochdale 
##                          158                          249 
##                     Rochford                   Rossendale 
##                          138                          145 
##                       Rother                    Rotherham 
##                          292                          574 
##                        Rugby                    Runnymede 
##                          330                          371 
##                   Rushcliffe                     Rushmoor 
##                          254                          184 
##                      Rutland                      Ryedale 
##                           93                          149 
##                      Salford                     Sandwell 
##                          284                          651 
##                  Scarborough             Scottish Borders 
##                          245                          220 
##                    Sedgemoor                       Sefton 
##                          243                          476 
##                        Selby                    Sevenoaks 
##                          190                          380 
##                    Sheffield                      Shepway 
##                         1138                          283 
##             Shetland Islands                   Shropshire 
##                           25                          571 
##                       Slough                     Solihull 
##                          465                          297 
##               South Ayrshire                  South Bucks 
##                          192                          225 
##         South Cambridgeshire             South Derbyshire 
##                          352                          193 
##        South Gloucestershire                   South Hams 
##                          394                          211 
##                South Holland               South Kesteven 
##                          243                          359 
##               South Lakeland            South Lanarkshire 
##                          277                          459 
##                South Norfolk       South Northamptonshire 
##                          270                          198 
##            South Oxfordshire                 South Ribble 
##                          331                          312 
##               South Somerset          South Staffordshire 
##                          323                          316 
##               South Tyneside                  Southampton 
##                          253                          594 
##              Southend-on-Sea                    Southwark 
##                          393                          907 
##                   Spelthorne                   St. Albans 
##                          308                          333 
##              St. Edmundsbury                   St. Helens 
##                          194                          317 
##                     Stafford      Staffordshire Moorlands 
##                          322                          212 
##                    Stevenage                     Stirling 
##                          167                          197 
##                    Stockport             Stockton-on-Tees 
##                          230                          311 
##               Stoke-on-Trent          Stratford-upon-Avon 
##                          579                          330 
##                       Stroud              Suffolk Coastal 
##                          132                          271 
##                   Sunderland                 Surrey Heath 
##                          531                          249 
##                       Sutton                        Swale 
##                          322                          361 
##                      Swansea                      Swindon 
##                          451                          448 
##                     Tameside                     Tamworth 
##                          238                          117 
##                    Tandridge                Taunton Deane 
##                          410                          192 
##                  Teignbridge           Telford and Wrekin 
##                          313                          227 
##                     Tendring                  Test Valley 
##                          290                          263 
##                   Tewkesbury                       Thanet 
##                          137                          430 
##        The Vale of Glamorgan                 Three Rivers 
##                          181                          190 
##                     Thurrock        Tonbridge and Malling 
##                          344                          366 
##                       Torbay                      Torfaen 
##                          281                           78 
##                     Torridge                Tower Hamlets 
##                          140                         1066 
##                     Trafford              Tunbridge Wells 
##                          260                          315 
##                   Uttlesford          Vale of White Horse 
##                          175                          281 
##                    Wakefield                      Walsall 
##                          717                          523 
##               Waltham Forest                   Wandsworth 
##                          642                          984 
##                   Warrington                      Warwick 
##                          505                          303 
##                      Watford                      Waveney 
##                          203                          215 
##                     Waverley                      Wealden 
##                          326                          439 
##               Wellingborough              Welwyn Hatfield 
##                          148                          295 
##               West Berkshire                   West Devon 
##                          294                          145 
##                  West Dorset          West Dunbartonshire 
##                          277                          118 
##              West Lancashire                 West Lindsey 
##                          264                          290 
##                 West Lothian             West Oxfordshire 
##                          404                          194 
##                West Somerset                Western Isles 
##                           76                           32 
##                  Westminster        Weymouth and Portland 
##                         1578                          123 
##                        Wigan                    Wiltshire 
##                          248                         1136 
##                   Winchester       Windsor and Maidenhead 
##                          323                          338 
##                       Wirral                       Woking 
##                          505                          268 
##                    Wokingham                Wolverhampton 
##                          264                          535 
##                    Worcester                     Worthing 
##                          162                          262 
##                      Wrexham                     Wychavon 
##                          261                          266 
##                      Wycombe                         Wyre 
##                          299                          239 
##                  Wyre Forest                         York 
##                          207                          449 
## 
## 
## Local_Authority_Highway:< table of extent 0 >
## 
## 
## First_Road_Class:
##            A         A(M)            B            C     Motorway Unclassified 
##        64280          402        17217        11069         5148        41940 
## 
## 
## Road_Type:
##   Dual carriageway     One way street         Roundabout Single carriageway 
##              20266               2873               9589             105066 
##          Slip road            Unknown 
##               1456                806 
## 
## 
## Junction_Detail:
##                          Crossroads        Data missing or out of range 
##                               13762                                   1 
##                     Mini-roundabout   More than 4 arms (not roundabout) 
##                                1900                                 999 
## Not at junction or within 20 metres                      Other junction 
##                               55927                                3103 
##           Private drive or entrance                          Roundabout 
##                                5252                               12423 
##                           Slip road             T or staggered junction 
##                                2032                               44657 
## 
## 
## Junction_Control:
##            Authorised person          Auto traffic signal 
##                          172                        14456 
## Data missing or out of range     Give way or uncontrolled 
##                        56070                        68655 
##                    Stop sign 
##                          703 
## 
## 
## Second_Road_Class:
##            A         A(M)            B            C     Motorway Unclassified 
##        13801           96         5626         5991          851        57088 
## 
## 
## Pedestrian_Crossing-Human_Control:
## Control by other authorised person  Control by school crossing patrol 
##                                433                                282 
##       Data missing or out of range              None within 50 metres 
##                                140                             139201 
## 
## 
## Pedestrian_Crossing-Physical_Facilities:
##                                                            Central refuge 
##                                                                      3157 
##                                              Data missing or out of range 
##                                                                       127 
##                                                      Footbridge or subway 
##                                                                       392 
##                          No physical crossing facilities within 50 metres 
##                                                                    113486 
##                               Pedestrian phase at traffic signal junction 
##                                                                     10865 
## Pelican, puffin, toucan or similar non-junction pedestrian light crossing 
##                                                                      7751 
##                                                                     Zebra 
##                                                                      4278 
## 
## 
## Light_Conditions:
## Darkness - lighting unknown       Darkness - lights lit 
##                        1920                       27551 
##     Darkness - lights unlit      Darkness - no lighting 
##                         719                        7258 
##                    Daylight 
##                      102608 
## 
## 
## Weather_Conditions:
##     Fine + high winds    Fine no high winds           Fog or mist 
##                  2353                113949                   662 
##                 Other  Raining + high winds Raining no high winds 
##                  1986                  2365                 15559 
##  Snowing + high winds Snowing no high winds               Unknown 
##                   175                   540                  2467 
## 
## 
## Road_Surface_Conditions:
## Data missing or out of range                          Dry 
##                          284                       101355 
##         Flood over 3cm. deep                 Frost or ice 
##                          184                         1461 
##                         Snow                  Wet or damp 
##                          535                        36237 
## 
## 
## Special_Conditions_at_Site:
##                 Auto signal part defective 
##                                         51 
##                  Auto traffic signal - out 
##                                        212 
##               Data missing or out of range 
##                                        106 
##                                        Mud 
##                                        424 
##                                       None 
##                                     136823 
##                              Oil or diesel 
##                                        384 
## Road sign or marking defective or obscured 
##                                        160 
##                     Road surface defective 
##                                        299 
##                                  Roadworks 
##                                       1597 
## 
## 
## Carriageway_Hazards:
## Any animal in carriageway (except ridden horse) 
##                                             610 
##                    Data missing or out of range 
##                                              95 
##                                            None 
##                                          137749 
##                            Other object on road 
##                                             929 
##         Pedestrian in carriageway - not injured 
##                                             307 
##                               Previous accident 
##                                             204 
##                            Vehicle load on road 
##                                             162 
## 
## 
## Urban_or_Rural_Area:
## Rural Urban 
## 47976 92080 
## 
## 
## Did_Police_Officer_Attend_Scene_of_Accident:
##                                                                       No 
##                                                                    27064 
## No - accident was reported using a self completion  form (self rep only) 
##                                                                      377 
##                                                                      Yes 
##                                                                   112607

2 - Exploratory Analysis

Lets Analyse The Road Type Where Large Number of Vehicles are Involved

th <- 9
# Subset to number of vehicles larger that threshold
df_large_crashes <- df["Number_of_Vehicles" > th, ]
barplot(prop.table(table(df_large_crashes$Road_Type)), las=2, cex.names=.5)

*Note: Its kind of obvious that highways and single Carriage account for the majority of cars involved

Lets Add Speed Limit to the Equation

# Adding Spped_limit 
(df_large_crashes %>% group_by(Road_Type, Speed_limit) %>% 
    tally() %>% 
    arrange(desc(n)))
## # A tibble: 37 x 3
## # Groups:   Road_Type [6]
##    Road_Type          Speed_limit     n
##    <chr>                    <int> <int>
##  1 Single carriageway          30 74667
##  2 Single carriageway          60 17081
##  3 Dual carriageway            70  8527
##  4 Single carriageway          40  6691
##  5 Roundabout                  30  6117
##  6 Dual carriageway            30  5524
##  7 Single carriageway          20  3575
##  8 Dual carriageway            40  3468
##  9 Single carriageway          50  3051
## 10 One way street              30  2489
## # … with 27 more rows

*Note: High Speed limits account for majority of cars involved (obvious). Considering the close proximity, the Road Type (Dual Carriage) and the Speed limit(70), we may find some areas were major crash in series occurs. Perhaps some more attention is needed to those spots by local Authorities.

Lets Bring the Area Density for a Detailed Analysis

load(file= "map.Rdata")

UK <- get_map(location = 'England', zoom=8)
map <- ggmap(UK)

´

Some Encoding of (Dates, Day Time, Season, Number Causalities

# Conver to datatable for easy encoding
df_copy <-data.table(df)

# Dummy copy
df_copy[, Accident_Index := NULL]


## Convert factor date to date 
df_copy[, Date       := as.Date( as.character( Date ), format = "%d/%m/%Y" ) ]

df_copy[, Day_of_year := as.Date( format( Date, "%m-%d" ), format = "%m-%d" )]

# Now as factors (data as Winter, Spring, Summer, Fall) (To be precise in the days)
df_copy[, Season      := ifelse( Day_of_year >= as.Date( "03-20", format = "%m-%d" ) & Day_of_year < as.Date( "06-21", format = "%m-%d" ) ,"Spring",
                                 ifelse( Day_of_year >= as.Date( "06-21", format = "%m-%d" ) & Day_of_year < as.Date( "09-22", format = "%m-%d" ) ,"Summer",
                                         ifelse( Day_of_year >= as.Date( "09-22", format = "%m-%d" ) & Day_of_year <= as.Date( "12-21", format = "%m-%d" ) ,"Fall", "Winter")))]

# Get the month of the year
df_copy[, c( "Month", "Day_of_year" ) := list( factor( format( Date, "%b" )),NULL ) ]

# Conver hour:minutes only to hours
df_copy[, Hour := as.numeric( substr( Time,1,2 )) ]

# Convert time to Morning, Afternoon, Night and Evening (Reduce Granularity)
breaks <- c( 0, 6, 12, 18, 23 )
labels <- c( "Night", "Morning", "Afternoon", "Evening" )
df_copy[, Day_Period := cut( as.numeric( Hour ), breaks = breaks,
                             labels = labels        , include.lowest=TRUE )]

# Check conversion CHECK
(apply(is.na(df_copy), 2, sum)) # Time 18 not being right converted
##                       Location_Easting_OSGR 
##                                          27 
##                      Location_Northing_OSGR 
##                                          27 
##                                   Longitude 
##                                          27 
##                                    Latitude 
##                                          27 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                                        Date 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                                        Time 
##                                           0 
##                    Local_Authority_District 
##                                           0 
##                            First_Road_Class 
##                                           0 
##                           First_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                           Second_Road_Class 
##                                       56603 
##                          Second_Road_Number 
##                                           0 
##           Pedestrian_Crossing-Human_Control 
##                                           0 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           8 
##                   LSOA_of_Accident_Location 
##                                           0 
##                                      Season 
##                                           0 
##                                       Month 
##                                           0 
##                                        Hour 
##                                          18 
##                                  Day_Period 
##                                          18
# Factorize number casualties ("Small", "Medium", "Large", "Overweelming")
breaks <- c( "0", "2", "4", "12", "inf" )
labels <- c( "Small", "Medium", "Large", "Overweelming" )

df_copy[, "Casualties_Class" := cut( Number_of_Casualties , breaks = breaks    ,
                                   labels = labels      , include.lowest=TRUE )]


# Summarize number of vehicles to (small, medium, high)
breaks <- c( "0", "2", "8",  "inf" )
labels <- c( "Small", "Medium", "Large" )
df_copy [, "Number_Vehicles_Class" := cut( Number_of_Vehicles, breaks = breaks,
                                  labels=labels     , include.lowest=TRUE )]

df_copy$Speed_limit  <- as.factor( df_copy$Speed_limit )


# Local copy to maintain the analisys further in the same domain.
df_subset <- df_copy

Histogram of Accident Severity

(table(df_subset$Accident_Severity))
## 
##   Fatal Serious  Slight 
##    1616   20038  118402
barplot(prop.table(table(df_subset$Accident_Severity)),las=2, cex.names=.5)

*Note: Majority of accidents are minor

Histogram of Number os Causualties per accident

barplot(prop.table(table(df_subset$Number_of_Casualties)), cex.names=.5)

  • Note: Major number casualties are 1 or 2

Do Some Correlation Beetween Causalities Class and Accident Severity

ggplot(df_subset, aes(Casualties_Class,fill =Accident_Severity)) + 
  geom_bar()+ 
  theme_bw() +
  geom_bar( position = "fill")+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: We observer that when a number of veichles are involved, the larger the number of deaths (highways)

Show the histogram of the acccidents by day of week

barplot(prop.table(table(df_subset$Day_of_Week)), las=2, cex.names=.5)

*Note: On Sundays we see lesst accidents, (people stay at home or drive more relaxed with family). Also we some increase on fridays, some hurry to reach home, our madness of the start of the weekend..

Correlated Accident Severety with Day of the Week

ggplot( df_subset, aes( Day_of_Week, fill = Accident_Severity ))+ 
  geom_bar()+ 
  theme_bw()+
  geom_bar( position = "fill")+
  theme(axis.text.x=element_text(angle=90, hjust=1))

  • Note: Nothing relevant to observe considering the fatalities group

Show Commnon Speed Limit at Accident Site

barplot(prop.table(table(df_subset$Speed_limit)), las=2, cex.names=.5)

Correlated Accident Severety with Speed Limit

ggplot(df_subset, aes(Speed_limit,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Interesting, lower speed limits have large count of accidents (local areas)

Show the road specs were occur more accidents

barplot(prop.table(table(df_subset$Road_Type)), las=2, cex.names=.5)

Correlated Accident severety with Road Type

ggplot(df_subset, aes(Road_Type,fill =Accident_Severity)) + 
  geom_bar()+ 
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Single Carriage accouint majority of major accidents and fatal victims

Show More Common Road Surface Conditions

barplot(prop.table(table(df_subset$Road_Surface_Conditions)), las=2, cex.names=.5)

Correlated Accident severety with Road Surface Conditions

ggplot(df_subset, aes(Road_Surface_Conditions,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Most crashed occur on dry or wet conditions, severe dont account victims (people are careful)

Show the Seasons Accident Histogram

barplot(prop.table(table(df_subset$Season)), las=2, cex.names=.5)

Correlated Accident Severety with Season

ggplot(df_subset, aes(Season,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Nothing relevant to conclude, summer and fall acocunt more accidents..

Accident by Month

barplot(prop.table(table(df_subset$Month)), las=2, cex.names=.5)

Correlated Accident Severety with Month

ggplot(df_subset, aes(Month,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Slight more accident in july and november

Accident by Hour

barplot(prop.table(table(df_subset$Hour)), las=2, cex.names=.5)

Correlated Accident Severety with Hour

ggplot(df_subset, aes(Hour,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents on the afternoon (people going home after work)

Summary, by Day Period

barplot(prop.table(table(df_subset$Day_Period)), las=2, cex.names=.5) ## ok obvious, last info tells 17h

Correlated Accident Severety with Day Period

ggplot(df_subset, aes(Day_Period,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents on the afternoon ( with a visible in fata victims (percentage related))

Show Where Occur Accidents (Urban, Rural)

barplot(prop.table(table(df_subset$Urban_or_Rural_Area)), las=2, cex.names=.5)

Correlated Accident severety with Urban or Rural

ggplot(df_subset, aes(Urban_or_Rural_Area,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents occur on urban scenarios, whoever theres a slight fatality on rural

Most Accidents Occurs Daylight, 17H, lets corroborate.

barplot(prop.table(table(df_subset$Light_Conditions)), las=2, cex.names=.5)

Correlated Accident Severety with Light Conditions

ggplot(df_subset, aes(Light_Conditions,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents occur on urban scenarios, whoever theres a slight fatality on rural

Show the Fisrt Road Class Number of Accidents

barplot(prop.table(table(df_subset$First_Road_Class)), las=2, cex.names=.5)

Correlated Accident Severety with First road class

ggplot(df_subset, aes(First_Road_Class,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents occur on A type roads, other are severl undetermined

Most of Accidents Occur in Uncontrolled Juntions. Lets corraborate

barplot(prop.table(table(df_subset$First_Road_Class)), las=2, cex.names=.5)

Correlated Accident Severety with Junction_Control

ggplot(df_subset, aes(Junction_Control,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Large peak of accidents occur when driver must give away priority

Show the Pedestrial Facilities Where Occur More Accidents

barplot(prop.table(table(df_subset$`Pedestrian_Crossing-Physical_Facilities`)), las=2, cex.names=.5)

Correlated Accident Severety with Pedestrian_Crossing-Physical_Facilities

ggplot(df_subset, aes(`Pedestrian_Crossing-Physical_Facilities`,fill =Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+ 
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Most pedestriac accidents occur when no crossing infratusture is nearby!!

Show the Histogram in Police Force areas called (most active)

barplot(prop.table(table(df_subset$Police_Force)), las=2, cex.names=.5)

Correlated Accident Severety with Police_Force area

ggplot(df_subset, aes(Police_Force, fill = Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+ 
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: metropolitan Police are the most called, obvious, metropolitan have more cars and pedestrians moving!!

Show the Histogram in Police Force areas called (most active)

barplot(prop.table(table(df_subset$Police_Force)), las=2, cex.names=.5)

Show Correlations with Police force area and Weather Conditions with Severety

ggplot(df_subset, aes(Weather_Conditions, fill = Accident_Severity))+
  geom_bar()+
  theme_bw()+
  geom_bar( position = "fill")+  
  theme_bw()+
  theme(axis.text.x=element_text(angle=90, hjust=1))

*Note: Most accicnet occur on metroploita with no severe weather conditions…

3 - Association Rules

Convert Into Transactions (all must be factors!)

Select Some relevant columns tha may be associated with accident values

*Note: Computing this values takes to long, we load them previous

load(file ="df_subset_Ent.Rdata")
load(file ="entropy.gains.Rdata")
load(file ="correlation.filter.selection.Rdata")
library(FSelector)

# Dummy copy
df_rules <- df_model
rm(df_model)

# Create an subset to see the correlation of the variables to give clues about their relevance.
df_rules_ent            <- df_rules[, setdiff( names( df_rules),c( "Longitude",
                                                                 "Latitude",
                                                                 "First_Road_Number",
                                                                 "Second_Road_Number",
                                                                 "Number_of_Casualties" ,
                                                                 "Time",
                                                                 "Date", 
                                                                 "Number_of_Vehicles", 
                                                                 "Hour",
                                                                 "LSOA_of_Accident_Location" ,
                                                                 "Location_Easting_OSGR",
                                                                 "Location_Northing_OSGR" )),
                                  with=F]

# Check the entropy gains regarding the Accident Severity
entropy.gains                  <- information.gain ( Accident_Severity~. , df_rules_ent )

# Compute the corollation
correlation.filter.selection   <- cfs              ( Accident_Severity~. , df_rules_ent )

Show the Variable Selections

# Select the variables to use
variable.use <- unique( c( cutoff.k( entropy.gains, 11 ), correlation.filter.selection ))

# Show the choices
(variable.use)

Create the subset with the entropy suggestions for laler use.

## Create the subset with the entropy suggestions for laler use.
subset_entropy <- df_subset[, c( "Accident_Severity", variable.use ), with=F ]

For our Initial Guess, Lets do a Analisys with Variables We Find Relevant.

df_rules <- df_subset

# Convert the remainder values to factors
df_rules$Season <- as.factor(df_rules$Season)
df_rules$Hour <- as.factor(df_rules$Hour)

# Here we make our initial selection.
subset <- subset(df_rules, select =c("Accident_Severity",
                                      "Speed_limit",
                                      "Light_Conditions",
                                      "Urban_or_Rural_Area",
                                      "Month",
                                      "Pedestrian_Crossing-Physical_Facilities",
                                      "Day_Period",
                                      "Road_Type",
                                      "Road_Surface_Conditions",
                                      "Junction_Detail",
                                      "Junction_Control",
                                      "Casualties_Class",
                                      "Weather_Conditions",
                                      "Season"))

# Convert then to transacions
trans <- as(subset, "transactions")
summary(trans)
## transactions as itemMatrix in sparse format with
##  140056 rows (elements/itemsets/transactions) and
##  85 columns (items) and a density of 0.1647044 
## 
## most frequent items:
##                                                                   Casualties_Class=Small 
##                                                                                   130533 
##                                                                 Accident_Severity=Slight 
##                                                                                   118402 
##                                                    Weather_Conditions=Fine no high winds 
##                                                                                   113949 
## Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres 
##                                                                                   113486 
##                                                             Road_Type=Single carriageway 
##                                                                                   105066 
##                                                                                  (Other) 
##                                                                                  1379330 
## 
## element (itemset/transaction) length distribution:
## sizes
##     13     14 
##     18 140038 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      13      14      14      14      14      14 
## 
## includes extended item information - examples:
##                      labels         variables  levels
## 1   Accident_Severity=Fatal Accident_Severity   Fatal
## 2 Accident_Severity=Serious Accident_Severity Serious
## 3  Accident_Severity=Slight Accident_Severity  Slight
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
# No longles need subset
rm(subset)

Check Items Labels

# Check some items labels
head(itemLabels(trans))
## [1] "Accident_Severity=Fatal"   "Accident_Severity=Serious"
## [3] "Accident_Severity=Slight"  "Speed_limit=0"            
## [5] "Speed_limit=10"            "Speed_limit=20"

Check Items Frequency

# Check the item frequency (top 25)
itemFrequencyPlot(trans, topN=25,  cex.names=.5)

Lest Look at Some Subsets of Transactions Containing the Itemset of Interest

*Note: Focus in the Accident_Severity=Fatal

trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)

Order Itemset by Lift

# Order By lift
itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)

*Note: The graph manifest that the Accident_Severity=Fatal appear frequently as the no physical crossing facilities within 50 meters also dry conditions and rural areas.

Focus in the Light_Conditions=Darkness - lights lit

trans_sub <- subset(trans, items %in% "Light_Conditions=Darkness - lights lit")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)

Order By lift

itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)

*Note: The graph manifest that the light conditions - Darkness lights lit Urbarn_or_Rural_Area=Urban and Speed_limit=30 appear frequently, what makes sense to a urban city area.

Focus in the Road_Surface_Conditions=Wet or damp

trans_sub <- subset(trans, items %in% "Road_Surface_Conditions=Wet or damp")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)

Order By lift

itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)

*Note: The graph manifest that the the Road_Surface_Conditions=Wet_or_damp apper frequently with Road_Type=Single Carriageway. Also ordering by lift we may find a strong co-occurence of Surface_Conditions=Wet or damp with the winter months (Nov, Dec, Jan) what is obvious.

Focus in the Day_Period=Night

trans_sub <- subset(trans, items %in% "Day_Period=Night")
itemFrequencyPlot(trans_sub, topN = 25, population = trans, cex.names=.5)

Order By lift

itemFrequencyPlot(trans_sub, topN = 25, population = trans, lift=TRUE, cex.names=.5)

*Note: Analysing the lift graph, we find that the Day_period=Night co-occcur toguether with Light_Conditions=Darkneess - lights unlit and Darkness -no Lighting and we also may find a small co-occurence of the fatal accidents severety. The lift value is not very high.

Lets Mining Frequent Items

# Drop trans_sub so save memory
rm(trans_sub)

# Find an interesting support (have at least 500 transactions)
(500/nrow(trans))
## [1] 0.003570001

Lets Mining Frequent Items

# User parameters
min_supp <- 0.0036
min_conf <- 0.5
min_lift <- 0.6
min_len <- 2
max_len <- 4

## Generate frequent itemsets
itemsets <- apriori(trans, parameter = list(target = "frequent",
  supp=min_supp, minlen = min_len, maxlen =max_len))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5  0.0036      2
##  maxlen            target  ext
##       4 frequent itemsets TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 504 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.10s].
## sorting and recoding items ... [75 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.34s].
## sorting transactions ... done [0.04s].
## writing ... [53285 set(s)] done [0.01s].
## creating S4 object  ... done [0.05s].

Inspect the 10 most Frequent Items

inspect(head(sort(itemsets), n=10))
##      items                                                                                        support transIdenticalToItemsets  count
## [1]  {Accident_Severity=Slight,                                                                                                          
##       Casualties_Class=Small}                                                                   0.7925116                        0 110996
## [2]  {Casualties_Class=Small,                                                                                                            
##       Weather_Conditions=Fine no high winds}                                                    0.7590892                        0 106315
## [3]  {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                          
##       Casualties_Class=Small}                                                                   0.7518707                        0 105304
## [4]  {Road_Type=Single carriageway,                                                                                                      
##       Casualties_Class=Small}                                                                   0.7022405                        0  98353
## [5]  {Road_Surface_Conditions=Dry,                                                                                                       
##       Weather_Conditions=Fine no high winds}                                                    0.6947650                        0  97306
## [6]  {Accident_Severity=Slight,                                                                                                          
##       Weather_Conditions=Fine no high winds}                                                    0.6863397                        0  96126
## [7]  {Light_Conditions=Daylight,                                                                                                         
##       Casualties_Class=Small}                                                                   0.6847547                        0  95904
## [8]  {Accident_Severity=Slight,                                                                                                          
##       Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} 0.6813489                        0  95427
## [9]  {Road_Surface_Conditions=Dry,                                                                                                       
##       Casualties_Class=Small}                                                                   0.6765080                        0  94749
## [10] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                          
##       Weather_Conditions=Fine no high winds}                                                    0.6569801                        0  92014

*Note: Here we can see that the majority of the accidents severety are Slight and the number of casualties small.

Another info is that the small casuality occur mostly without existence of any pedestrian physiscal facilities.

Another info is that the majority of the small accidents occur in the Single carriage road type.

Adding a Extra Measure of Quality

quality(itemsets)$lift <- interestMeasure(itemsets, measure="lift", trans = trans)

## Lets analyse
inspect(head(sort(itemsets, by = "lift"), n=10))
##      items                                                                                     support transIdenticalToItemsets count     lift
## [1]  {Speed_limit=70,                                                                                                                         
##       Urban_or_Rural_Area=Rural,                                                                                                              
##       Road_Type=Dual carriageway,                                                                                                             
##       Junction_Detail=Slip road}                                                           0.004341121                        0   608 87.83085
## [2]  {Speed_limit=70,                                                                                                                         
##       Road_Type=Dual carriageway,                                                                                                             
##       Junction_Detail=Slip road,                                                                                                              
##       Junction_Control=Give way or uncontrolled}                                           0.005255041                        0   736 74.29733
## [3]  {Light_Conditions=Darkness - lights lit,                                                                                                 
##       Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Junction_Detail=Crossroads,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.011345462                        0  1589 73.30477
## [4]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Road_Type=Dual carriageway,                                                                                                             
##       Junction_Detail=Crossroads,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.007775461                        0  1089 68.29762
## [5]  {Urban_or_Rural_Area=Urban,                                                                                                              
##       Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Junction_Detail=Crossroads,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.033279545                        0  4661 64.33681
## [6]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Day_Period=Evening,                                                                                                                     
##       Junction_Detail=Crossroads,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.007554121                        0  1058 58.95044
## [7]  {Speed_limit=30,                                                                                                                         
##       Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Junction_Detail=Crossroads,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.029566745                        0  4141 58.72352
## [8]  {Month=Nov,                                                                                                                              
##       Road_Surface_Conditions=Wet or damp,                                                                                                    
##       Weather_Conditions=Raining + high winds,                                                                                                
##       Season=Fall}                                                                         0.005119381                        0   717 49.62872
## [9]  {Speed_limit=70,                                                                                                                         
##       Light_Conditions=Darkness - no lighting,                                                                                                
##       Urban_or_Rural_Area=Rural,                                                                                                              
##       Road_Type=Dual carriageway}                                                          0.008717941                        0  1221 49.38169
## [10] {Speed_limit=40,                                                                                                                         
##       Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                                    
##       Road_Type=Dual carriageway,                                                                                                             
##       Junction_Control=Auto traffic signal}                                                0.004676701                        0   655 47.89292

Lets plot

plot(head(sort(itemsets_subset, by = "lift"), n=10), method = "graph", control=list(cex=.8))

*Note: A quick analisys shows that Accident_Severity=Fatal, Light_Conditions=Darkness - lights lit, Road_Type=Roundabout, Junction_Detail=Roundabout have an reasonable lift suggesting that they Co-occurr together, what makes sense regarding the fact that suggest a city crossing, (City))

Subset the Overwellming class

# Generate the subset transactions containing only overwellming number casualties
trans_sub <- subset(trans, items %in% "Casualties_Class=Overweelming")

# Generate the itemsets
itemsets_subset <- apriori(trans_sub, parameter = list(target = "frequent",
                                                       supp=min_supp, minlen = min_len, maxlen=max_len))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5  0.0036      2
##  maxlen            target  ext
##       4 frequent itemsets TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[52 item(s), 18 transaction(s)] done [0.00s].
## sorting and recoding items ... [52 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [16234 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

Inspect the 10 most frequent item

inspect(head(sort(itemsets_subset), n=10))
##      items                                                                                        support transIdenticalToItemsets count
## [1]  {Light_Conditions=Daylight,                                                                                                        
##       Casualties_Class=Overweelming}                                                            0.9444444                        0    17
## [2]  {Road_Surface_Conditions=Dry,                                                                                                      
##       Casualties_Class=Overweelming}                                                            0.8888889                        0    16
## [3]  {Road_Surface_Conditions=Dry,                                                                                                      
##       Weather_Conditions=Fine no high winds}                                                    0.8333333                        0    15
## [4]  {Casualties_Class=Overweelming,                                                                                                    
##       Weather_Conditions=Fine no high winds}                                                    0.8333333                        0    15
## [5]  {Light_Conditions=Daylight,                                                                                                        
##       Road_Surface_Conditions=Dry}                                                              0.8333333                        0    15
## [6]  {Road_Surface_Conditions=Dry,                                                                                                      
##       Casualties_Class=Overweelming,                                                                                                    
##       Weather_Conditions=Fine no high winds}                                                    0.8333333                        0    15
## [7]  {Light_Conditions=Daylight,                                                                                                        
##       Road_Surface_Conditions=Dry,                                                                                                      
##       Casualties_Class=Overweelming}                                                            0.8333333                        0    15
## [8]  {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                         
##       Casualties_Class=Overweelming}                                                            0.7777778                        0    14
## [9]  {Light_Conditions=Daylight,                                                                                                        
##       Weather_Conditions=Fine no high winds}                                                    0.7777778                        0    14
## [10] {Light_Conditions=Daylight,                                                                                                        
##       Road_Surface_Conditions=Dry,                                                                                                      
##       Weather_Conditions=Fine no high winds}                                                    0.7777778                        0    14

Attain only those with large number of casualties involved

itemsets_subset <- subset(itemsets_subset, subset = items %in% "Casualties_Class=Overweelming")

quality(itemsets_subset)$lift <- interestMeasure(itemsets_subset,
                                                 measure="lift", trans = trans_sub)

# Lets analyse
inspect(head(sort(itemsets_subset, by = "lift"), n=10))
##      items                                                                                    support transIdenticalToItemsets count lift
## [1]  {Speed_limit=60,                                                                                                                    
##       Road_Type=Roundabout,                                                                                                              
##       Junction_Detail=Roundabout,                                                                                                        
##       Casualties_Class=Overweelming}                                                       0.05555556                        0     1  324
## [2]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                               
##       Junction_Detail=More than 4 arms (not roundabout),                                                                                 
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Raining no high winds}                                            0.05555556                        0     1  324
## [3]  {Accident_Severity=Fatal,                                                                                                           
##       Month=Feb,                                                                                                                         
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Fog or mist}                                                      0.05555556                        0     1  324
## [4]  {Light_Conditions=Darkness - lights lit,                                                                                            
##       Day_Period=Night,                                                                                                                  
##       Road_Type=One way street,                                                                                                          
##       Casualties_Class=Overweelming}                                                       0.05555556                        0     1  324
## [5]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                               
##       Road_Surface_Conditions=Wet or damp,                                                                                               
##       Junction_Detail=More than 4 arms (not roundabout),                                                                                 
##       Casualties_Class=Overweelming}                                                       0.05555556                        0     1  162
## [6]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                               
##       Junction_Detail=More than 4 arms (not roundabout),                                                                                 
##       Junction_Control=Auto traffic signal,                                                                                              
##       Casualties_Class=Overweelming}                                                       0.05555556                        0     1  162
## [7]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                               
##       Road_Surface_Conditions=Wet or damp,                                                                                               
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Raining no high winds}                                            0.05555556                        0     1  162
## [8]  {Pedestrian_Crossing-Physical_Facilities=Pedestrian phase at traffic signal junction,                                               
##       Junction_Control=Auto traffic signal,                                                                                              
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Raining no high winds}                                            0.05555556                        0     1  162
## [9]  {Road_Surface_Conditions=Wet or damp,                                                                                               
##       Junction_Detail=More than 4 arms (not roundabout),                                                                                 
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Raining no high winds}                                            0.05555556                        0     1  162
## [10] {Junction_Detail=More than 4 arms (not roundabout),                                                                                 
##       Junction_Control=Auto traffic signal,                                                                                              
##       Casualties_Class=Overweelming,                                                                                                     
##       Weather_Conditions=Raining no high winds}                                            0.05555556                        0     1  162

Lets plot

plot(head(sort(itemsets_subset, by = "lift"), n=10), method = "graph", control=list(cex=.8))

*Note: {Accident_Severity=Fatal, Month=feb., Casualties_Class=Overweelming,Weather_Conditions=Fog or mist} We may find that fog, february, Fatal accidnet and number of casuaties high co-occur frequently. An typical conditions to have major accidents

Mine Association Rules

rules <- apriori(trans, parameter = list(supp=min_supp, maxlen=max_len, target="rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5  0.0036      1
##  maxlen target  ext
##       4  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 504 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.09s].
## sorting and recoding items ... [75 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.32s].
## writing ... [52078 rule(s)] done [0.01s].
## creating S4 object  ... done [0.05s].

Quick inspection

inspect(head(sort(rules, by="lift"), n=10))
##      lhs                                                                                           rhs                                           support confidence    coverage     lift count
## [1]  {Speed_limit=60,                                                                                                                                                                         
##       Day_Period=Evening,                                                                                                                                                                     
##       Season=Fall}                                                                              => {Light_Conditions=Darkness - no lighting} 0.004184041  0.8060523 0.005190781 15.55421   586
## [2]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway,                                                                                                                                                             
##       Junction_Detail=Not at junction or within 20 metres}                                      => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [3]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway,                                                                                                                                                             
##       Junction_Control=Data missing or out of range}                                            => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [4]  {Accident_Severity=Slight,                                                                                                                                                               
##       Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70}                          0.007475581  0.9010327 0.008296681 13.10981  1047
## [5]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Urban_or_Rural_Area=Rural,                                                                                                                                                              
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70}                          0.008717941  0.8931968 0.009760382 12.99580  1221
## [6]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                                                                               
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70}                          0.009446222  0.8921106 0.010588622 12.98000  1323
## [7]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway,                                                                                                                                                             
##       Weather_Conditions=Fine no high winds}                                                    => {Speed_limit=70}                          0.006326041  0.8913481 0.007097161 12.96890   886
## [8]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway,                                                                                                                                                             
##       Road_Surface_Conditions=Dry}                                                              => {Speed_limit=70}                          0.004619581  0.8911846 0.005183641 12.96652   647
## [9]  {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70}                          0.009474782  0.8900067 0.010645742 12.94938  1327
## [10] {Light_Conditions=Darkness - no lighting,                                                                                                                                                
##       Road_Type=Dual carriageway,                                                                                                                                                             
##       Season=Fall}                                                                              => {Speed_limit=70}                          0.003955561  0.8892456 0.004448221 12.93831   554

Plot the scatterplot

plot(rules)

Lower the support value

(200/nrow(trans))
## [1] 0.001428
min_supp = 0.0015

# Generate the items
rules <- apriori(trans, parameter = list(supp=min_supp, maxlen=max_len, target ="rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5  0.0015      1
##  maxlen target  ext
##       4  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 210 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[85 item(s), 140056 transaction(s)] done [0.09s].
## sorting and recoding items ... [77 item(s)] done [0.02s].
## creating transaction tree ... done [0.08s].
## checking subsets of size 1 2 3 4 done [0.32s].
## writing ... [75935 rule(s)] done [0.01s].
## creating S4 object  ... done [0.05s].

Quick inspection

inspect(head(sort(rules, by="lift"), n=10))
##      lhs                                                      rhs                                           support confidence    coverage     lift count
## [1]  {Speed_limit=60,                                                                                                                                    
##       Month=Oct,                                                                                                                                         
##       Day_Period=Evening}                                  => {Light_Conditions=Darkness - no lighting} 0.001599360  0.8296296 0.001927800 16.00918   224
## [2]  {Speed_limit=60,                                                                                                                                    
##       Day_Period=Evening,                                                                                                                                
##       Season=Fall}                                         => {Light_Conditions=Darkness - no lighting} 0.004184041  0.8060523 0.005190781 15.55421   586
## [3]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Month=Dec,                                                                                                                                         
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.001563660  0.9201681 0.001699320 13.38823   219
## [4]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Season=Winter}                                       => {Speed_limit=70}                          0.003420061  0.9037736 0.003784201 13.14969   479
## [5]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Day_Period=Night,                                                                                                                                  
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.002941680  0.9035088 0.003255841 13.14584   412
## [6]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [7]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Junction_Control=Data missing or out of range}       => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [8]  {Accident_Severity=Slight,                                                                                                                          
##       Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.007475581  0.9010327 0.008296681 13.10981  1047
## [9]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Day_Period=Afternoon,                                                                                                                              
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.002606100  0.8990148 0.002898840 13.08045   365
## [10] {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Month=Jan,                                                                                                                                         
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.001549380  0.8966942 0.001727880 13.04669   217

Plot the scatterplot

plot(rules)

*Note: We may find some rules that area obvious to us such as Speed_limit=60, Month=Nov, Day_Period=Evening} => {Light_Conditions=Darkness - no lighting}. In order to obtain more relevant ones we gona focus in the cases that acciddents are fatal.

Subset the Accident_Severity=Fatal

r_subset <- subset(rules, subset = items %in% "Accident_Severity=Fatal")

## Inspect
inspect(head(sort(r_subset, by="lift"), 10))
##      lhs                                                                                           rhs                             support confidence    coverage     lift count
## [1]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Day_Period=Afternoon}                                                                     => {Urban_or_Rural_Area=Rural} 0.001685040  0.9915966 0.001699320 2.894761   236
## [2]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Light_Conditions=Daylight}                                                                => {Urban_or_Rural_Area=Rural} 0.002770320  0.9897959 0.002798880 2.889504   388
## [3]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Road_Type=Single carriageway}                                                             => {Urban_or_Rural_Area=Rural} 0.003855601  0.9854015 0.003912721 2.876676   540
## [4]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Urban_or_Rural_Area=Rural} 0.004019821  0.9842657 0.004084081 2.873360   563
## [5]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Road_Surface_Conditions=Dry}                                                              => {Urban_or_Rural_Area=Rural} 0.002727480  0.9820051 0.002777460 2.866761   382
## [6]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Junction_Control=Data missing or out of range}                                            => {Urban_or_Rural_Area=Rural} 0.003113040  0.9819820 0.003170161 2.866693   436
## [7]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Junction_Detail=Not at junction or within 20 metres}                                      => {Urban_or_Rural_Area=Rural} 0.003105900  0.9819413 0.003163021 2.866574   435
## [8]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Weather_Conditions=Fine no high winds}                                                    => {Urban_or_Rural_Area=Rural} 0.003320101  0.9810127 0.003384361 2.863863   465
## [9]  {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60}                                                                           => {Urban_or_Rural_Area=Rural} 0.004034101  0.9809028 0.004112641 2.863543   565
## [10] {Accident_Severity=Fatal,                                                                                                                                                  
##       Speed_limit=60,                                                                                                                                                           
##       Casualties_Class=Small}                                                                   => {Urban_or_Rural_Area=Rural} 0.002984520  0.9766355 0.003055920 2.851085   418

Lets plot

itemFrequencyPlot(items(r_subset), topN=10, cex.names=.5)

Plot graph by lift

plot(head(sort(r_subset, by="lift"), 10),
  method="graph", control=list(cex=.7))

*Note: We may find that {Accident_Severity=Fatal, Speed_limit=60, Light_Conditions=Daylight} => {Urban_or_Rural_Area=Rural} Co-occur together the lift is not very high, but is bigger than one what may suggest that they are positive co-related.

Subset by Light conditions to find some Patterns

r_subset <- subset(rules, subset = items %in% "Light_Conditions=Darkness - no lighting")

## Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
##      lhs                                                      rhs                                           support confidence    coverage     lift count
## [1]  {Speed_limit=60,                                                                                                                                    
##       Month=Oct,                                                                                                                                         
##       Day_Period=Evening}                                  => {Light_Conditions=Darkness - no lighting} 0.001599360  0.8296296 0.001927800 16.00918   224
## [2]  {Speed_limit=60,                                                                                                                                    
##       Day_Period=Evening,                                                                                                                                
##       Season=Fall}                                         => {Light_Conditions=Darkness - no lighting} 0.004184041  0.8060523 0.005190781 15.55421   586
## [3]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Month=Dec,                                                                                                                                         
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.001563660  0.9201681 0.001699320 13.38823   219
## [4]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Season=Winter}                                       => {Speed_limit=70}                          0.003420061  0.9037736 0.003784201 13.14969   479
## [5]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Day_Period=Night,                                                                                                                                  
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.002941680  0.9035088 0.003255841 13.14584   412
## [6]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Junction_Detail=Not at junction or within 20 metres} => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [7]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway,                                                                                                                        
##       Junction_Control=Data missing or out of range}       => {Speed_limit=70}                          0.008418061  0.9013761 0.009339121 13.11481  1179
## [8]  {Accident_Severity=Slight,                                                                                                                          
##       Light_Conditions=Darkness - no lighting,                                                                                                           
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.007475581  0.9010327 0.008296681 13.10981  1047
## [9]  {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Day_Period=Afternoon,                                                                                                                              
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.002606100  0.8990148 0.002898840 13.08045   365
## [10] {Light_Conditions=Darkness - no lighting,                                                                                                           
##       Month=Jan,                                                                                                                                         
##       Road_Type=Dual carriageway}                          => {Speed_limit=70}                          0.001549380  0.8966942 0.001727880 13.04669   217

Plot itemfrequency

itemFrequencyPlot(items(r_subset), topN=10, cex.names=0.5)

Show graph

plot(head(sort(r_subset, by="lift"), 10),
  method="graph", control=list(cex=.7))

*Note: From an quick analisys, Speed_limit=60, Month=Nov, Day_Period=Evening} => {Light_Conditions=Darkness - no lighting} is a coerent rule but not very usefull.

##Subset by Road Surface conditions

r_subset <- subset(rules, subset = items %in% "Road_Surface_Conditions=Wet or damp")

# Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
##      lhs                                          rhs                              support confidence    coverage      lift count
## [1]  {Light_Conditions=Darkness - no lighting,                                                                                   
##       Road_Type=Dual carriageway,                                                                                                
##       Road_Surface_Conditions=Wet or damp}     => {Speed_limit=70}             0.004333981  0.8861314 0.004890901 12.893000   607
## [2]  {Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp,                                                                                       
##       Junction_Control=Auto traffic signal}    => {Junction_Detail=Roundabout} 0.001535100  0.9862385 0.001556520 11.118782   215
## [3]  {Speed_limit=40,                                                                                                            
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.002763180  0.9748111 0.002834580 10.989949   387
## [4]  {Speed_limit=60,                                                                                                            
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.001906380  0.9709091 0.001963500 10.945959   267
## [5]  {Urban_or_Rural_Area=Rural,                                                                                                 
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.006097561  0.9467849 0.006440281 10.673984   854
## [6]  {Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp,                                                                                       
##       Season=Summer}                           => {Junction_Detail=Roundabout} 0.002984520  0.9186813 0.003248701 10.357146   418
## [7]  {Day_Period=Night,                                                                                                          
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.001842120  0.9148936 0.002013480 10.314444   258
## [8]  {Month=Jan,                                                                                                                 
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.002370480  0.8997290 0.002634660 10.143479   332
## [9]  {Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp,                                                                                       
##       Weather_Conditions=Fine no high winds}   => {Junction_Detail=Roundabout} 0.006561661  0.8896418 0.007375621 10.029757   919
## [10] {Day_Period=Afternoon,                                                                                                      
##       Road_Type=Roundabout,                                                                                                      
##       Road_Surface_Conditions=Wet or damp}     => {Junction_Detail=Roundabout} 0.005776261  0.8860898 0.006518821  9.989712   809

Plot itemfrequency

itemFrequencyPlot(items(r_subset), topN=10, cex.names=0.5) #(Too big to plot)

Show graph

plot(head(sort(r_subset, by="lift"), 10),
  method="graph", control=list(cex=.7))

*Note: We may find that Light_Conditions=Darkness - no lighting, Road_Type=Dual carriageway, Road_Surface_Conditions=Wet or damp} => {Speed_limit=70} have an lift value > 1 suggesting that they co-occur together (of course wet conditions and Darkness leads to more accidents).

Subset by road type.

r_subset <- subset(rules, subset = items %in% "Road_Type=Dual carriageway")

# Inspect
inspect(head(sort(r_subset, by="lift"), n=10))
##      lhs                                                                                           rhs                  support confidence    coverage     lift count
## [1]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Month=Dec,                                                                                                                                                     
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.001563660  0.9201681 0.001699320 13.38823   219
## [2]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Road_Type=Dual carriageway,                                                                                                                                    
##       Season=Winter}                                                                            => {Speed_limit=70} 0.003420061  0.9037736 0.003784201 13.14969   479
## [3]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Day_Period=Night,                                                                                                                                              
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.002941680  0.9035088 0.003255841 13.14584   412
## [4]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Road_Type=Dual carriageway,                                                                                                                                    
##       Junction_Detail=Not at junction or within 20 metres}                                      => {Speed_limit=70} 0.008418061  0.9013761 0.009339121 13.11481  1179
## [5]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Road_Type=Dual carriageway,                                                                                                                                    
##       Junction_Control=Data missing or out of range}                                            => {Speed_limit=70} 0.008418061  0.9013761 0.009339121 13.11481  1179
## [6]  {Accident_Severity=Slight,                                                                                                                                      
##       Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.007475581  0.9010327 0.008296681 13.10981  1047
## [7]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Day_Period=Afternoon,                                                                                                                                          
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.002606100  0.8990148 0.002898840 13.08045   365
## [8]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Month=Jan,                                                                                                                                                     
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.001549380  0.8966942 0.001727880 13.04669   217
## [9]  {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Urban_or_Rural_Area=Rural,                                                                                                                                     
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.008717941  0.8931968 0.009760382 12.99580  1221
## [10] {Light_Conditions=Darkness - no lighting,                                                                                                                       
##       Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                                                      
##       Road_Type=Dual carriageway}                                                               => {Speed_limit=70} 0.009446222  0.8921106 0.010588622 12.98000  1323

Plot itemfrequency

itemFrequencyPlot(items(r_subset), topN=10,  cex.names=0.5) #(Too big to plot)

Show graph

plot(head(sort(r_subset, by="lift"), 10),
  method="graph", control=list(cex=.7))

*Note: We may find the rule {Light_Conditions=Darkness - no lighting, Road_Type=Dual carriageway, Casualties_Class=Medium} => {Speed_limit=70} contains a co-occurence of high number of car involved. Clearely a auto -way.

From our generated rules lets find out those with more conviction (High Values of conviction (indicates that the consequent depends strongly in the antecendent)

rules_convi <- cbind(as(rules, "data.frame"),
                              conviction=interestMeasure(rules,
                                                         "conviction", trans))
# Order by descreasing value of conviction
rules_convi <- rules_convi[order(rules_convi$conviction),]

# Get only the top ones%
per <-0.0005 # define percentage
top_rules <- head(rules_convi[order(rules_convi$conviction, decreasing = T),], per*nrow(rules_convi))

# Show then (they are ordered by conviction)
(top_rules$rules)
##  [1] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}         
##  [2] {Accident_Severity=Slight,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                         
##  [3] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                      
##  [4] {Speed_limit=30,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                       
##  [5] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}               
##  [6] {Accident_Severity=Slight,Speed_limit=30,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                       
##  [7] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                      
##  [8] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                               
##  [9] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                  
## [10] {Accident_Severity=Slight,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}             
## [11] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}
## [12] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                           
## [13] {Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                                               
## [14] {Day_Period=Afternoon,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                             
## [15] {Light_Conditions=Daylight,Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}            
## [16] {Accident_Severity=Slight,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                                
## [17] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                            
## [18] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}          
## [19] {Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                                                        
## [20] {Light_Conditions=Daylight,Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                  
## [21] {Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                            
## [22] {Accident_Severity=Slight,Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                            
## [23] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                                     
## [24] {Accident_Severity=Slight,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                          
## [25] {Light_Conditions=Daylight,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                        
## [26] {Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                                                         
## [27] {Road_Type=Single carriageway,Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                      
## [28] {Day_Period=Morning,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                                      
## [29] {Speed_limit=30,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                                
## [30] {Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                                  
## [31] {Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                     
## [32] {Speed_limit=30,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                                         
## [33] {Day_Period=Morning,Road_Type=Single carriageway,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                               
## [34] {Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                                   
## [35] {Speed_limit=30,Day_Period=Afternoon,Junction_Detail=Not at junction or within 20 metres} => {Junction_Control=Data missing or out of range}                                                                                           
## [36] {Light_Conditions=Daylight,Junction_Detail=Not at junction or within 20 metres,Casualties_Class=Small} => {Junction_Control=Data missing or out of range}                                                                              
## [37] {Road_Surface_Conditions=Dry,Junction_Detail=Not at junction or within 20 metres,Weather_Conditions=Fine no high winds} => {Junction_Control=Data missing or out of range}                                                             
## 75935 Levels: {} => {Accident_Severity=Slight} ... {Weather_Conditions=Unknown} => {Casualties_Class=Small}

*Note: Of course the most commun ones appears first with strong conviction. However we are insterested in a more grain set of groups of itemsets and rules, namelly those that contain Fatal accidents as consequent.

Try to find rules that have in the consequent Accident_Severity=Fatal

# From our rules, subset then
rules_subset <- subset(rules, subset=(rhs %in% c("Accident_Severity=Fatal")))

# Check if any rules were generated
summary(rules_subset)
## set of 0 rules

*Note: The current parameters values were not able to generate the items that correspond to the fatal accidents as consequent. (Rare Cases)

Lets lower our standards trying to catch the fatal ones

# In order to save our memory we gonna subset the transactions that contain fatal ones
trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")

## Set new parameters
(1/nrow(trans_sub))
## [1] 0.0006188119
min_supp <- 50/nrow(trans_sub)
min_conf <- 0.00025

# Generate the rules with consequent as Accident_Severity=Fatal"
rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
                                                target ="rules"),
                        appearance = list(rhs=c("Accident_Severity=Fatal")))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime    support minlen
##     0.00025    0.1    1 none FALSE            TRUE       5 0.03094059      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 50 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[79 item(s), 1616 transaction(s)] done [0.00s].
## sorting and recoding items ... [49 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10 done [0.06s].
## writing ... [21346 rule(s)] done [0.00s].
## creating S4 object  ... done [0.01s].

Check if any rules were generated

summary(rules_subset)
## set of 21346 rules
## 
## rule length distribution (lhs + rhs):sizes
##    1    2    3    4    5    6    7    8    9   10 
##    1   48  530 2209 4709 5750 4546 2440  897  216 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   6.000   6.097   7.000  10.000 
## 
## summary of quality measures:
##     support          confidence    coverage            lift       count       
##  Min.   :0.03094   Min.   :1    Min.   :0.03094   Min.   :1   Min.   :  50.0  
##  1st Qu.:0.03651   1st Qu.:1    1st Qu.:0.03651   1st Qu.:1   1st Qu.:  59.0  
##  Median :0.04703   Median :1    Median :0.04703   Median :1   Median :  76.0  
##  Mean   :0.06513   Mean   :1    Mean   :0.06513   Mean   :1   Mean   : 105.2  
##  3rd Qu.:0.07054   3rd Qu.:1    3rd Qu.:0.07054   3rd Qu.:1   3rd Qu.: 114.0  
##  Max.   :1.00000   Max.   :1    Max.   :1.00000   Max.   :1   Max.   :1616.0  
## 
## mining info:
##       data ntransactions    support confidence
##  trans_sub          1616 0.03094059    0.00025

Ok some rules where generated, inspect then

inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Accident_Severity=Fatal")), by="lift"), n=5)
##     lhs                                                                                           rhs                         support confidence  coverage lift count
## [1] {}                                                                                         => {Accident_Severity=Fatal} 1.0000000          1 1.0000000    1  1616
## [2] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Accident_Severity=Fatal} 0.8620050          1 0.8620050    1  1393
## [3] {Weather_Conditions=Fine no high winds}                                                    => {Accident_Severity=Fatal} 0.8199257          1 0.8199257    1  1325
## [4] {Casualties_Class=Small}                                                                   => {Accident_Severity=Fatal} 0.8193069          1 0.8193069    1  1324
## [5] {Road_Type=Single carriageway}                                                             => {Accident_Severity=Fatal} 0.7493812          1 0.7493812    1  1211
## [6] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                                                       
##      Weather_Conditions=Fine no high winds}                                                    => {Accident_Severity=Fatal} 0.7054455          1 0.7054455    1  1140

Plot itemfrequency

itemFrequencyPlot(items(rules_subset), topN=10, cex.names=0.5) #(Too big to plot)

Show graph

plot(head(sort(rules_subset, by="lift"), 10),
  method="graph", control=list(cex=.7))

*Note: The diagram may suggest that the fatal accidents happens with no particular positive correlations (lift =1 -> Independent).

Lest find out in witch conditions accurs major accidents with many cars involved

# In order to save our memory we gonna subset the transactions that Casualties_Class=Large
trans_sub <- subset(trans, items %in% "Casualties_Class=Large")

# Set new parameters
(500/nrow(trans_sub))
## [1] 0.3810976
min_supp <- 500/nrow(trans_sub)
min_conf <- 0.00025

# Generate the rules with consequent as Accident_Severity=Fatal"
rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
                                                    target ="rules"),
                        appearance = list(rhs=c("Casualties_Class=Large")))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime   support minlen
##     0.00025    0.1    1 none FALSE            TRUE       5 0.3810976      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 500 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[77 item(s), 1312 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.00s].
## writing ... [65 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

Check if any rules were generated

summary(rules_subset)
## set of 65 rules
## 
## rule length distribution (lhs + rhs):sizes
##  1  2  3  4  5 
##  1 12 26 22  4 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.000   3.246   4.000   5.000 
## 
## summary of quality measures:
##     support         confidence    coverage           lift       count     
##  Min.   :0.3819   Min.   :1    Min.   :0.3819   Min.   :1   Min.   : 501  
##  1st Qu.:0.4200   1st Qu.:1    1st Qu.:0.4200   1st Qu.:1   1st Qu.: 551  
##  Median :0.4794   Median :1    Median :0.4794   Median :1   Median : 629  
##  Mean   :0.5092   Mean   :1    Mean   :0.5092   Mean   :1   Mean   : 668  
##  3rd Qu.:0.5610   3rd Qu.:1    3rd Qu.:0.5610   3rd Qu.:1   3rd Qu.: 736  
##  Max.   :1.0000   Max.   :1    Max.   :1.0000   Max.   :1   Max.   :1312  
## 
## mining info:
##       data ntransactions   support confidence
##  trans_sub          1312 0.3810976    0.00025

Ok some rules where generated, inspect then

inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Casualties_Class=Large")), by="lift"), n=5)
##     lhs                                                                                           rhs                        support confidence  coverage lift count
## [1] {}                                                                                         => {Casualties_Class=Large} 1.0000000          1 1.0000000    1  1312
## [2] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres} => {Casualties_Class=Large} 0.8605183          1 0.8605183    1  1129
## [3] {Weather_Conditions=Fine no high winds}                                                    => {Casualties_Class=Large} 0.8170732          1 0.8170732    1  1072
## [4] {Pedestrian_Crossing-Physical_Facilities=No physical crossing facilities within 50 metres,                                                                      
##      Weather_Conditions=Fine no high winds}                                                    => {Casualties_Class=Large} 0.7065549          1 0.7065549    1   927
## [5] {Light_Conditions=Daylight}                                                                => {Casualties_Class=Large} 0.6996951          1 0.6996951    1   918
## [6] {Road_Type=Single carriageway}                                                             => {Casualties_Class=Large} 0.6928354          1 0.6928354    1   909

*Note: Clearely an rule regarding a high speed motorway

For The subset obtained with the entropy method, lets do a fine analisys

# Quick copy to enable reuse of code
subset <- subset_entropy

# Remove so save memory
rm(subset_entropy)

# Convert to transactions
trans <- as(subset, "transactions")
summary(trans)
(500/nrow(trans))
min_supp <- 0.0036
min_conf <- 0.5
min_lift <- 0.6
min_len <- 2
max_len <- 4

# Generate frequent itemsets
itemsets <- apriori(trans, parameter = list(target = "frequent",
                                            supp=min_supp, minlen = min_len, maxlen =max_len))

Inspect the 10 most frequent item

inspect(head(sort(itemsets), n=10))

Adding a extra measure of quality

quality(itemsets)$lift <- interestMeasure(itemsets, measure="lift", trans = trans)

# Lets analyse
inspect(head(sort(itemsets, by = "lift"), n=10))
trans_sub <- subset(trans, items %in% "Accident_Severity=Fatal")

## Generate the itemsets
itemsets_subset <- apriori(trans_sub, parameter = list(target = "frequent",
                                                       supp=min_supp, minlen = min_len, maxlen=max_len))

Inspect the 10 most frequent item

inspect(head(sort(itemsets_subset), n=10))

*Note: Police always attend fatal accidents, norma

Generate rules with rhs as fatal

Generate the rules with consequent as Accident_Severity=Fatal"

rules_subset <- apriori(trans_sub, parameter = list(supp=min_supp, conf=min_conf,
                                                    target ="rules"),
                        appearance = list(rhs=c("Accident_Severity=Fatal")))

Check if any rules were generated

summary(rules_subset)

Ok some rules where generated, inspect then

inspect(head(sort(subset(rules_subset, subset= rhs %pin% "Accident_Severity=Fatal")), by="lift"), n=5)

*Note: We see that the almost the same results were attained. Police always attend fata crashes {Junction_Detail=Not at junction or within 20 metres} => {Accident_Severity=Fatal} Ok no infraestructure or juctions.

3 - Modeling

Letf firt handle the incmolete cases and infer of just simply remove

# Staring with our last dataframe preprocessed
df_model <- df_subset # make shallow copy 

# Show number inconplete cases
(apply(is.na(df_model), 2, sum))
##                       Location_Easting_OSGR 
##                                          27 
##                      Location_Northing_OSGR 
##                                          27 
##                                   Longitude 
##                                          27 
##                                    Latitude 
##                                          27 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                                        Date 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                                        Time 
##                                           0 
##                    Local_Authority_District 
##                                           0 
##                            First_Road_Class 
##                                           0 
##                           First_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                           Second_Road_Class 
##                                       56603 
##                          Second_Road_Number 
##                                           0 
##           Pedestrian_Crossing-Human_Control 
##                                           0 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           8 
##                   LSOA_of_Accident_Location 
##                                           0 
##                                      Season 
##                                           0 
##                                       Month 
##                                           0 
##                                        Hour 
##                                          18 
##                                  Day_Period 
##                                          18 
##                            Casualties_Class 
##                                           0 
##                       Number_Vehicles_Class 
##                                           0
# Show number of entries
(nrow(df_model))
## [1] 140056
# Drop datasframes
#rm(df_subset)
#rm(df_copy)

*Note: Second_Road_Class has to many incomplete cases, just remove it, and locations esting and lsoa accident

Removing Column Second Road Class and Remove Other Incomplete Cases

# Lets drop initialy the time to see things, 
df_model <- df_model[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
                              'LSOA_of_Accident_Location','Second_Road_Class', 'First_Road_Number', 'Second_Road_Number', 'Accident_Severity', 'Day_of_Week')]

# We have enough examples for modeling, just remove incomplete entries (only few of then)
df_model <- na.omit(df_model) 

# Show number incomplete cases
(apply(is.na(df_model), 2, sum))
##                                   Longitude 
##                                           0 
##                                    Latitude 
##                                           0 
##                                Police_Force 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                    Local_Authority_District 
##                                           0 
##                            First_Road_Class 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##           Pedestrian_Crossing-Human_Control 
##                                           0 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           0 
##                                      Season 
##                                           0 
##                                       Month 
##                                           0 
##                                        Hour 
##                                           0 
##                                  Day_Period 
##                                           0 
##                            Casualties_Class 
##                                           0 
##                       Number_Vehicles_Class 
##                                           0
# Show number of entries
(nrow(df_model))
## [1] 140003
# Show number of levels each column (Levels columns only)
(apply( df_model, 2, function(x) length( unique(x) )))
##                                   Longitude 
##                                      133293 
##                                    Latitude 
##                                      106277 
##                                Police_Force 
##                                          51 
##                          Number_of_Vehicles 
##                                          16 
##                        Number_of_Casualties 
##                                          22 
##                    Local_Authority_District 
##                                         380 
##                            First_Road_Class 
##                                           6 
##                                   Road_Type 
##                                           6 
##                                 Speed_limit 
##                                           8 
##                             Junction_Detail 
##                                          10 
##                            Junction_Control 
##                                           5 
##           Pedestrian_Crossing-Human_Control 
##                                           4 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           7 
##                            Light_Conditions 
##                                           5 
##                          Weather_Conditions 
##                                           9 
##                     Road_Surface_Conditions 
##                                           6 
##                  Special_Conditions_at_Site 
##                                           9 
##                         Carriageway_Hazards 
##                                           7 
##                         Urban_or_Rural_Area 
##                                           2 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           3 
##                                      Season 
##                                           4 
##                                       Month 
##                                          12 
##                                        Hour 
##                                          24 
##                                  Day_Period 
##                                           4 
##                            Casualties_Class 
##                                           4 
##                       Number_Vehicles_Class 
##                                           3
# Quick view on data
(head(df_model))
##    Longitude Latitude        Police_Force Number_of_Vehicles
## 1: -0.198465 51.50554 Metropolitan Police                  1
## 2: -0.178838 51.49184 Metropolitan Police                  1
## 3: -0.205590 51.51491 Metropolitan Police                  1
## 4: -0.208327 51.51495 Metropolitan Police                  1
## 5: -0.206022 51.49657 Metropolitan Police                  2
## 6: -0.193610 51.50079 Metropolitan Police                  2
##    Number_of_Casualties Local_Authority_District First_Road_Class
## 1:                    1   Kensington and Chelsea                C
## 2:                    1   Kensington and Chelsea     Unclassified
## 3:                    1   Kensington and Chelsea                B
## 4:                    1   Kensington and Chelsea                B
## 5:                    1   Kensington and Chelsea                A
## 6:                    1   Kensington and Chelsea                A
##             Road_Type Speed_limit         Junction_Detail
## 1: Single carriageway          30 T or staggered junction
## 2: Single carriageway          30 T or staggered junction
## 3: Single carriageway          30         Mini-roundabout
## 4: Single carriageway          30              Crossroads
## 5: Single carriageway          30              Crossroads
## 6: Single carriageway          30 T or staggered junction
##            Junction_Control Pedestrian_Crossing-Human_Control
## 1: Give way or uncontrolled             None within 50 metres
## 2: Give way or uncontrolled             None within 50 metres
## 3: Give way or uncontrolled             None within 50 metres
## 4: Give way or uncontrolled             None within 50 metres
## 5:      Auto traffic signal             None within 50 metres
## 6: Give way or uncontrolled             None within 50 metres
##                                      Pedestrian_Crossing-Physical_Facilities
## 1:                          No physical crossing facilities within 50 metres
## 2:                          No physical crossing facilities within 50 metres
## 3:                                                                     Zebra
## 4:                          No physical crossing facilities within 50 metres
## 5:                               Pedestrian phase at traffic signal junction
## 6: Pelican, puffin, toucan or similar non-junction pedestrian light crossing
##         Light_Conditions    Weather_Conditions Road_Surface_Conditions
## 1: Darkness - lights lit    Fine no high winds                     Dry
## 2:              Daylight    Fine no high winds                     Dry
## 3: Darkness - lights lit Raining no high winds             Wet or damp
## 4:              Daylight    Fine no high winds             Wet or damp
## 5:              Daylight Raining no high winds             Wet or damp
## 6:              Daylight    Fine no high winds             Wet or damp
##    Special_Conditions_at_Site Carriageway_Hazards Urban_or_Rural_Area
## 1:                       None                None               Urban
## 2:                       None                None               Urban
## 3:                       None                None               Urban
## 4:                       None                None               Urban
## 5:                       None                None               Urban
## 6:                       None                None               Urban
##    Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1:                                         Yes Winter   Jan   18  Afternoon
## 2:                                         Yes Winter   Jan    7    Morning
## 3:                                         Yes Winter   Jan   18  Afternoon
## 4:                                          No Winter   Jan    7    Morning
## 5:                                          No Winter   Jan    7    Morning
## 6:                                         Yes Winter   Jan    9    Morning
##    Casualties_Class Number_Vehicles_Class
## 1:            Small                 Small
## 2:            Small                 Small
## 3:            Small                 Small
## 4:            Small                 Small
## 5:            Small                 Small
## 6:            Small                 Small
# Quick vies on column type see if all its ok
(str(df_model))
## Classes 'data.table' and 'data.frame':   140003 obs. of  26 variables:
##  $ Longitude                                  : num  -0.198 -0.179 -0.206 -0.208 -0.206 ...
##  $ Latitude                                   : num  51.5 51.5 51.5 51.5 51.5 ...
##  $ Police_Force                               : chr  "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" ...
##  $ Number_of_Vehicles                         : int  1 1 1 1 2 2 2 2 2 2 ...
##  $ Number_of_Casualties                       : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Local_Authority_District                   : chr  "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" ...
##  $ First_Road_Class                           : chr  "C" "Unclassified" "B" "B" ...
##  $ Road_Type                                  : chr  "Single carriageway" "Single carriageway" "Single carriageway" "Single carriageway" ...
##  $ Speed_limit                                : Factor w/ 8 levels "0","10","20",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Junction_Detail                            : chr  "T or staggered junction" "T or staggered junction" "Mini-roundabout" "Crossroads" ...
##  $ Junction_Control                           : chr  "Give way or uncontrolled" "Give way or uncontrolled" "Give way or uncontrolled" "Give way or uncontrolled" ...
##  $ Pedestrian_Crossing-Human_Control          : chr  "None within 50 metres" "None within 50 metres" "None within 50 metres" "None within 50 metres" ...
##  $ Pedestrian_Crossing-Physical_Facilities    : chr  "No physical crossing facilities within 50 metres" "No physical crossing facilities within 50 metres" "Zebra" "No physical crossing facilities within 50 metres" ...
##  $ Light_Conditions                           : chr  "Darkness - lights lit" "Daylight" "Darkness - lights lit" "Daylight" ...
##  $ Weather_Conditions                         : chr  "Fine no high winds" "Fine no high winds" "Raining no high winds" "Fine no high winds" ...
##  $ Road_Surface_Conditions                    : chr  "Dry" "Dry" "Wet or damp" "Wet or damp" ...
##  $ Special_Conditions_at_Site                 : chr  "None" "None" "None" "None" ...
##  $ Carriageway_Hazards                        : chr  "None" "None" "None" "None" ...
##  $ Urban_or_Rural_Area                        : chr  "Urban" "Urban" "Urban" "Urban" ...
##  $ Did_Police_Officer_Attend_Scene_of_Accident: chr  "Yes" "Yes" "Yes" "No" ...
##  $ Season                                     : chr  "Winter" "Winter" "Winter" "Winter" ...
##  $ Month                                      : Factor w/ 12 levels "Apr","Aug","Dec",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ Hour                                       : num  18 7 18 7 7 9 9 15 6 15 ...
##  $ Day_Period                                 : Factor w/ 4 levels "Night","Morning",..: 3 2 3 2 2 2 2 3 1 3 ...
##  $ Casualties_Class                           : Factor w/ 4 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Number_Vehicles_Class                      : Factor w/ 3 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
## NULL

*Note: perhaps latitude and longitus is to descriminative, lest drop it for now and find a way to summarize this later

Drop Latitude and logitude

# Lets drop initialy the time to see things
df_model <- df_model[, -c('Latitude', 'Longitude')]

# Show number of levels each column (Levels columns only)
(apply( df_model, 2, function(x) length( unique(x) )))
##                                Police_Force 
##                                          51 
##                          Number_of_Vehicles 
##                                          16 
##                        Number_of_Casualties 
##                                          22 
##                    Local_Authority_District 
##                                         380 
##                            First_Road_Class 
##                                           6 
##                                   Road_Type 
##                                           6 
##                                 Speed_limit 
##                                           8 
##                             Junction_Detail 
##                                          10 
##                            Junction_Control 
##                                           5 
##           Pedestrian_Crossing-Human_Control 
##                                           4 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           7 
##                            Light_Conditions 
##                                           5 
##                          Weather_Conditions 
##                                           9 
##                     Road_Surface_Conditions 
##                                           6 
##                  Special_Conditions_at_Site 
##                                           9 
##                         Carriageway_Hazards 
##                                           7 
##                         Urban_or_Rural_Area 
##                                           2 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           3 
##                                      Season 
##                                           4 
##                                       Month 
##                                          12 
##                                        Hour 
##                                          24 
##                                  Day_Period 
##                                           4 
##                            Casualties_Class 
##                                           4 
##                       Number_Vehicles_Class 
##                                           3
# Debug
(nrow(df_model))
## [1] 140003

Considering the fact that we want to predict the number of fatalities given features lets model an ML to capture this

Lest split out testset (stratifyed?)

# Convert chars to factors in this reduced space
#df_model <- mutate_if(df_model, is.character, as.factor)

# Wuick pick
(nrow(df_model))
## [1] 140003
# Show number incomplete cases
(apply(is.na(df_model), 2, sum))
##                                Police_Force 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                    Local_Authority_District 
##                                           0 
##                            First_Road_Class 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##           Pedestrian_Crossing-Human_Control 
##                                           0 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           0 
##                                      Season 
##                                           0 
##                                       Month 
##                                           0 
##                                        Hour 
##                                           0 
##                                  Day_Period 
##                                           0 
##                            Casualties_Class 
##                                           0 
##                       Number_Vehicles_Class 
##                                           0
#rm(df_copy)
rm(map)

# convert to upper case coliumns, some models dont handle this well
for( i in colnames(df_model)){
    colnames(df_model)[which(colnames(df_model)==i)] = toupper(i)
 }


# Ok, there is here a column messi, lest remove the -
setnames( df_model  , "PEDESTRIAN_CROSSING-HUMAN_CONTROL"  , "PEDESTRIAN_CROSSING_HUMAN_CONTROL"  )
setnames( df_model  , "PEDESTRIAN_CROSSING-PHYSICAL_FACILITIES"  , "PEDESTRIAN_CROSSING_PHYSICAL_FACILITIES")


# Ensure thet they are factors
library(dplyr)
df_model=df_model %>% mutate_if(is.character, as.factor)

# This column has 53 categoriees, lest drop them
df_model <- df_model[, -c('LOCAL_AUTHORITY_DISTRICT')]

#Create an small sample (2000 cases)
sample_perc <- 0.05

# Extract samples for hold out method
set.seed(1234) # for reproduction
idx_sample <-sample(1:nrow(df_model),as.integer(sample_perc*nrow(df_model)))

length(idx_sample)
## [1] 7000
df_model <- df_model[idx_sample, ]


# Set the threshold
split_val <- 0.7

# Extract samples for hold out method
set.seed(1234) # for reproduction
idx.tr <-sample(1:nrow(df_model),as.integer(split_val*nrow(df_model)))
tr <- df_model[idx.tr,]
ts <- df_model[-idx.tr,] 

ts = rbind(tr[1,],ts)
ts = ts[-1,]

# We should ensure a stratifyed sample
#library(caret)

#idx.tr <- createDataPartition(df_model, p = split_val, list = FALSE)
#tr <- df_model[idx.tr,]
#ts <- df_model[-idx.tr,] 

# Debug
(str(tr))
## Classes 'data.table' and 'data.frame':   4900 obs. of  23 variables:
##  $ POLICE_FORCE                               : Factor w/ 51 levels "Avon and Somerset",..: 31 28 30 15 26 50 47 10 24 3 ...
##  $ NUMBER_OF_VEHICLES                         : int  1 2 1 2 2 1 1 2 2 2 ...
##  $ NUMBER_OF_CASUALTIES                       : int  1 1 1 1 1 2 1 2 1 1 ...
##  $ FIRST_ROAD_CLASS                           : Factor w/ 6 levels "A","A(M)","B",..: 4 1 6 3 1 1 6 3 1 3 ...
##  $ ROAD_TYPE                                  : Factor w/ 6 levels "Dual carriageway",..: 4 4 4 4 4 1 4 4 1 4 ...
##  $ SPEED_LIMIT                                : Factor w/ 8 levels "0","10","20",..: 7 6 4 7 4 5 4 7 5 7 ...
##  $ JUNCTION_DETAIL                            : Factor w/ 10 levels "Crossroads","Data missing or out of range",..: 5 10 10 10 8 5 5 10 1 5 ...
##  $ JUNCTION_CONTROL                           : Factor w/ 5 levels "Authorised person",..: 3 4 4 4 2 3 3 4 4 3 ...
##  $ PEDESTRIAN_CROSSING_HUMAN_CONTROL          : Factor w/ 4 levels "Control by other authorised person",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ PEDESTRIAN_CROSSING_PHYSICAL_FACILITIES    : Factor w/ 7 levels "Central refuge",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ LIGHT_CONDITIONS                           : Factor w/ 5 levels "Darkness - lighting unknown",..: 5 2 5 5 5 5 2 5 2 4 ...
##  $ WEATHER_CONDITIONS                         : Factor w/ 9 levels "Fine + high winds",..: 2 2 2 2 2 2 5 2 2 6 ...
##  $ ROAD_SURFACE_CONDITIONS                    : Factor w/ 6 levels "Data missing or out of range",..: 2 2 2 2 2 2 6 2 2 6 ...
##  $ SPECIAL_CONDITIONS_AT_SITE                 : Factor w/ 9 levels "Auto signal part defective",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ CARRIAGEWAY_HAZARDS                        : Factor w/ 7 levels "Any animal in carriageway (except ridden horse)",..: 3 3 3 3 3 3 1 3 3 3 ...
##  $ URBAN_OR_RURAL_AREA                        : Factor w/ 2 levels "Rural","Urban": 1 1 2 1 1 2 1 1 1 1 ...
##  $ DID_POLICE_OFFICER_ATTEND_SCENE_OF_ACCIDENT: Factor w/ 3 levels "No","No - accident was reported using a self completion  form (self rep only)",..: 3 3 1 3 1 3 3 3 3 3 ...
##  $ SEASON                                     : Factor w/ 4 levels "Fall","Spring",..: 2 3 4 4 4 3 1 4 1 1 ...
##  $ MONTH                                      : Factor w/ 12 levels "Apr","Aug","Dec",..: 9 12 5 8 8 7 10 8 12 11 ...
##  $ HOUR                                       : num  19 23 13 8 12 15 19 16 21 6 ...
##  $ DAY_PERIOD                                 : Factor w/ 4 levels "Night","Morning",..: 4 4 3 2 2 3 4 3 4 1 ...
##  $ CASUALTIES_CLASS                           : Factor w/ 4 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ NUMBER_VEHICLES_CLASS                      : Factor w/ 3 levels "Small","Medium",..: 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
## NULL
(nrow(tr))
## [1] 4900
(nrow(ts))
## [1] 2100

Setup an Simple baseline using trees (considering the size of dataset, gonna use holdout)

# Make several estimations and select the best model
res_TREES <- performanceEstimation(
  PredTask(NUMBER_OF_CASUALTIES~ .,tr),
  workflowVariants(learner="rpartXse",learner.pars=list(se=c(0,0.5,1, 2, 3))),
  EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))

# Save the model results
save(res_TREES, file = "res_TREES1.RData")

Load TREES

# Load the data
load(file = "res_TREES1.RData")

# Plot the graphs
plot(res_TREES)

# See the performance rank
topPerformers(res_TREES)
## $tr.NUMBER_OF_CASUALTIES
##        Workflow Estimate
## mse rpartXse.v1    0.243
# Get the parameters
(getWorkflow("rpartXse.v2",res_TREES))
## Workflow Object:
##  Workflow ID       ::  rpartXse.v2 
##  Workflow Function ::  standardWF
##       Parameter values:
##       learner.pars  -> se=0.5 
##       learner  -> rpartXse
  • Note: We see that increasing the number of se leads to wprse results

Setup an RF Model (here im preloding models avoid training time)

require('randomForest')

#df_model1 <- df_model[, -c('LOCAL_AUTHORITY_DISTRICT')]

# Make several estimations and select the best model
res_RF <- performanceEstimation(
  PredTask(NUMBER_OF_CASUALTIES ~ .,tr),
  workflowVariants(learner="randomForest", learner.pars=list(ntree= c(100, 500, 1000))), 
 EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))

# Save the model results
save(res_RF, file = "res_RF.RData")

Load RF

# Load the data
load(file = "res_RF.RData")

# Plot the graphs
plot(res_RF)

# See the performance rank
topPerformers(res_RF)
## $tr.NUMBER_OF_CASUALTIES
##            Workflow Estimate
## mse randomForest.v3    0.264

Setup an SVM Model

# Make several estimations and select the best model
res_SVM <- performanceEstimation(
  PredTask(NUMBER_OF_CASUALTIES ~ .,tr),
  workflowVariants(learner="svm", learner.pars=list(kernel=c("linear", "radial"), cost=c(1,3), gamma=c(0.1,0.01), epsilon=0.1)),

 EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))

# Save the model results
save(res_SVM, file = "res_SVM.RData")

Load SVM

# Load the data
load(file = "res_SVM.RData")

# Plot the graphs
plot(res_SVM)

# See the performance rank
topPerformers(res_SVM)
## $tr.NUMBER_OF_CASUALTIES
##     Workflow Estimate
## mse   svm.v3    0.254

Setup Several models

# Make several estimations and select the best model
res_EXP <- performanceEstimation(
   PredTask(NUMBER_OF_CASUALTIES ~ ., tr),
   c( Workflow(learner="naiveBayes"),
        workflowVariants(learner="svm", learner.pars=list(kernel=c("linear", "radial"), cost=c(1,3))),
      Workflow(learner="randomForest", learner.pars=list( ntree=3000))
        ),
   EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))

# Save the model results
save(res_EXP, file = "res_EXP.RData")

Load EXP

# Load the data
load(file = "res_EXP.RData")

# Plot the graphs
plot(res_EXP)

# See the performance rank
topPerformers(res_EXP)
## $tr.NUMBER_OF_CASUALTIES
##     Workflow Estimate
## mse   svm.v3    0.254

Set a buch of modle with default parameters

res_MODELS <- performanceEstimation(
  PredTask(NUMBER_OF_CASUALTIES ~ ., tr),
  workflowVariants("standardWF",
                   learner=c("rpartXse","svm","randomForest")),
  EstimationTask(metrics="mse",method=Holdout(nReps=3,hldSz=0.3)))

# Save the model results
save(res_MODELS, file = "res_MODELS.RData")
# Load the data
load(file = "res_MODELS.RData")

# Plot the graphs
plot(res_MODELS)

# See the performance rank
topPerformers(res_MODELS)
## $tr.NUMBER_OF_CASUALTIES
##     Workflow Estimate
## mse rpartXse    0.259

4 - Clustering

Lets try some Clustering and Analyse (first we gonna start by dropping some locations and time columns, and some incomplete columns) Whe are interested in predict the number of casualties, so we drop the target variable alsp

require('cluster')
require(cluster)
require(dplyr)
require(ggplot2)
require(readr)
require(Rtsne)

# Some intial colum name analisis
(colnames(df_subset))
##  [1] "Location_Easting_OSGR"                      
##  [2] "Location_Northing_OSGR"                     
##  [3] "Longitude"                                  
##  [4] "Latitude"                                   
##  [5] "Police_Force"                               
##  [6] "Accident_Severity"                          
##  [7] "Number_of_Vehicles"                         
##  [8] "Number_of_Casualties"                       
##  [9] "Date"                                       
## [10] "Day_of_Week"                                
## [11] "Time"                                       
## [12] "Local_Authority_District"                   
## [13] "First_Road_Class"                           
## [14] "First_Road_Number"                          
## [15] "Road_Type"                                  
## [16] "Speed_limit"                                
## [17] "Junction_Detail"                            
## [18] "Junction_Control"                           
## [19] "Second_Road_Class"                          
## [20] "Second_Road_Number"                         
## [21] "Pedestrian_Crossing-Human_Control"          
## [22] "Pedestrian_Crossing-Physical_Facilities"    
## [23] "Light_Conditions"                           
## [24] "Weather_Conditions"                         
## [25] "Road_Surface_Conditions"                    
## [26] "Special_Conditions_at_Site"                 
## [27] "Carriageway_Hazards"                        
## [28] "Urban_or_Rural_Area"                        
## [29] "Did_Police_Officer_Attend_Scene_of_Accident"
## [30] "LSOA_of_Accident_Location"                  
## [31] "Season"                                     
## [32] "Month"                                      
## [33] "Hour"                                       
## [34] "Day_Period"                                 
## [35] "Casualties_Class"                           
## [36] "Number_Vehicles_Class"
# Lets drop initialy the time to see things
df_notime <- df_subset[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR', 'LSOA_of_Accident_Location','Second_Road_Class', 'Number_of_Casualties')]

# Access the number of levels
(apply(df_notime, 2, function(x) length( unique(x) )))
##                                   Longitude 
##                                      133317 
##                                    Latitude 
##                                      106293 
##                                Police_Force 
##                                          51 
##                           Accident_Severity 
##                                           3 
##                          Number_of_Vehicles 
##                                          16 
##                                 Day_of_Week 
##                                           7 
##                    Local_Authority_District 
##                                         380 
##                            First_Road_Class 
##                                           6 
##                           First_Road_Number 
##                                        4340 
##                                   Road_Type 
##                                           6 
##                                 Speed_limit 
##                                           8 
##                             Junction_Detail 
##                                          10 
##                            Junction_Control 
##                                           5 
##                          Second_Road_Number 
##                                        3795 
##           Pedestrian_Crossing-Human_Control 
##                                           4 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           7 
##                            Light_Conditions 
##                                           5 
##                          Weather_Conditions 
##                                           9 
##                     Road_Surface_Conditions 
##                                           6 
##                  Special_Conditions_at_Site 
##                                           9 
##                         Carriageway_Hazards 
##                                           7 
##                         Urban_or_Rural_Area 
##                                           2 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           4 
##                                      Season 
##                                           4 
##                                       Month 
##                                          12 
##                                        Hour 
##                                          25 
##                                  Day_Period 
##                                           5 
##                            Casualties_Class 
##                                           4 
##                       Number_Vehicles_Class 
##                                           3
# Access the incomplete cases
(apply(is.na(df_notime), 2, sum))
##                                   Longitude 
##                                          27 
##                                    Latitude 
##                                          27 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                    Local_Authority_District 
##                                           0 
##                            First_Road_Class 
##                                           0 
##                           First_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                          Second_Road_Number 
##                                           0 
##           Pedestrian_Crossing-Human_Control 
##                                           0 
##     Pedestrian_Crossing-Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           8 
##                                      Season 
##                                           0 
##                                       Month 
##                                           0 
##                                        Hour 
##                                          18 
##                                  Day_Period 
##                                          18 
##                            Casualties_Class 
##                                           0 
##                       Number_Vehicles_Class 
##                                           0
# Count incomplete cases
sum(apply(df_notime, 1, function(x){any(is.na(x))}))
## [1] 53
# Remove those incomplete cases in small number
df_notime <- na.omit(df_notime)

Lets just subset our dataset for memory issues

# Ok, we have memory problems, lest just pick a small subset of rows
split_val <- 0.05

# Extract samples for hold out method
set.seed(1234) # for reproduction
idx.tr <-sample(1:nrow(df_notime),as.integer(split_val*nrow(df_notime)))
df_notime_sample <- df_notime[idx.tr,]

# Convert chars to factors in this reduced space
df_notime_sample <- mutate_if(df_notime_sample, is.character, as.factor)

# Wuick pick
(nrow(df_notime_sample))
## [1] 7000

Explore Gower distance, enabling to computed as the average of partial dissimilarities across individuals.

# Compute Gower distance
gower_dist <- daisy(df_notime_sample, metric = "gower")
gower_mat <- as.matrix(gower_dist)

#' Print most similar clients
(df_notime_sample[which(gower_mat == min(gower_mat[gower_mat != min(gower_mat)]), arr.ind = TRUE)[1, ], ])
##    Longitude Latitude Police_Force Accident_Severity Number_of_Vehicles
## 1:  0.285162 51.31869         Kent            Slight                  3
## 2:  0.504544 51.34127         Kent            Slight                  3
##    Day_of_Week Local_Authority_District First_Road_Class First_Road_Number
## 1:    Saturday    Tonbridge and Malling         Motorway                20
## 2:    Saturday    Tonbridge and Malling         Motorway                 2
##           Road_Type Speed_limit                     Junction_Detail
## 1: Dual carriageway          70 Not at junction or within 20 metres
## 2: Dual carriageway          70 Not at junction or within 20 metres
##                Junction_Control Second_Road_Number
## 1: Data missing or out of range                  0
## 2: Data missing or out of range                  0
##    Pedestrian_Crossing-Human_Control
## 1:             None within 50 metres
## 2:             None within 50 metres
##             Pedestrian_Crossing-Physical_Facilities Light_Conditions
## 1: No physical crossing facilities within 50 metres         Daylight
## 2: No physical crossing facilities within 50 metres         Daylight
##    Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site
## 1: Fine no high winds                     Dry                       None
## 2: Fine no high winds                     Dry                       None
##    Carriageway_Hazards Urban_or_Rural_Area
## 1:                None               Rural
## 2:                None               Rural
##    Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1:                                         Yes   Fall   Oct    9    Morning
## 2:                                         Yes   Fall   Oct    9    Morning
##    Casualties_Class Number_Vehicles_Class
## 1:            Small                Medium
## 2:            Small                Medium
#' Print most dissimilar clients
(df_notime_sample[which(gower_mat == max(gower_mat[gower_mat != max(gower_mat)]), arr.ind = TRUE)[1, ], ])
##    Longitude Latitude    Police_Force Accident_Severity Number_of_Vehicles
## 1: -1.237951  53.4900 South Yorkshire           Serious                  1
## 2: -2.192345  52.9077   Staffordshire            Slight                  4
##    Day_of_Week Local_Authority_District First_Road_Class First_Road_Number
## 1:      Friday                Doncaster                A              6023
## 2:     Tuesday                 Stafford         Motorway                 6
##             Road_Type Speed_limit                     Junction_Detail
## 1: Single carriageway          40                          Crossroads
## 2:   Dual carriageway          70 Not at junction or within 20 metres
##                Junction_Control Second_Road_Number
## 1:     Give way or uncontrolled                  0
## 2: Data missing or out of range                  0
##    Pedestrian_Crossing-Human_Control
## 1:             None within 50 metres
## 2:      Data missing or out of range
##             Pedestrian_Crossing-Physical_Facilities      Light_Conditions
## 1: No physical crossing facilities within 50 metres Darkness - lights lit
## 2:                     Data missing or out of range              Daylight
##       Weather_Conditions Road_Surface_Conditions   Special_Conditions_at_Site
## 1: Raining no high winds             Wet or damp                         None
## 2:    Fine no high winds                     Dry Data missing or out of range
##             Carriageway_Hazards Urban_or_Rural_Area
## 1:                         None               Urban
## 2: Data missing or out of range               Rural
##    Did_Police_Officer_Attend_Scene_of_Accident Season Month Hour Day_Period
## 1:                                         Yes Winter   Mar    2      Night
## 2:                                          No Summer   Sep   16  Afternoon
##    Casualties_Class Number_Vehicles_Class
## 1:            Small                 Small
## 2:            Small                Medium

Partial dissimilarities (d_ij^f) computation depend on the type of variable being evaluated.

The PAM algorithm searches for the k representative objects (the medoids) among the cases in the given data set.

As with k-means each observation is allocated to the nearest medoid.

PAM is more robust to the presence of outliers because it uses original objects as centroids instead of averages that may be subject to the effects of outliers

sil_width <- c(NA)
for(i in 2:8){  
  pam_fit <- pam(gower_dist, diss = TRUE, k = i)  
  sil_width[i] <- pam_fit$silinfo$avg.width  
}

plot(1:8, sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(1:8, sil_width)

  • Note: Acccording to the elbow technique, the optimal number of cluster is 5

Lets form the clusters with 6 distictivive classes

k <- 5
pam_fit <- pam(gower_dist, diss = TRUE, k)
pam_results <- df_notime_sample %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
pam_results$the_summary
## [[1]]
##    Longitude          Latitude                 Police_Force  Accident_Severity
##  Min.   :-6.0995   Min.   :50.07   Thames Valley     :  69   Fatal  :  34     
##  1st Qu.:-2.5907   1st Qu.:51.45   Kent              :  65   Serious: 250     
##  Median :-1.2756   Median :52.34   Surrey            :  65   Slight :1154     
##  Mean   :-1.4956   Mean   :52.64   Devon and Cornwall:  58                    
##  3rd Qu.:-0.3068   3rd Qu.:53.41   Sussex            :  55                    
##  Max.   : 1.7025   Max.   :60.31   Hampshire         :  50                    
##                                    (Other)           :1076                    
##  Number_of_Vehicles    Day_of_Week    Local_Authority_District
##  Min.   :1.000      Friday   :198   Cornwall      :  21       
##  1st Qu.:1.000      Monday   :199   Cheshire East :  19       
##  Median :2.000      Saturday :178   Northumberland:  18       
##  Mean   :1.882      Sunday   :174   Doncaster     :  16       
##  3rd Qu.:2.000      Thursday :201   Peterborough  :  16       
##  Max.   :7.000      Tuesday  :158   Highland      :  14       
##                     Wednesday:330   (Other)       :1334       
##      First_Road_Class First_Road_Number              Road_Type    Speed_limit 
##  A           :843     Min.   :   0.0    Dual carriageway  :384   60     :689  
##  A(M)        : 12     1st Qu.:  20.0    One way street    :  8   70     :302  
##  B           :176     Median : 143.0    Roundabout        : 53   30     :167  
##  C           :102     Mean   : 815.5    Single carriageway:959   40     :167  
##  Motorway    :143     3rd Qu.: 648.2    Slip road         : 27   50     :107  
##  Unclassified:162     Max.   :9157.0    Unknown           :  7   20     :  6  
##                                                                  (Other):  0  
##                             Junction_Detail
##  Not at junction or within 20 metres:1171  
##  Roundabout                         :  79  
##  Crossroads                         :  60  
##  T or staggered junction            :  60  
##  Private drive or entrance          :  29  
##  Other junction                     :  20  
##  (Other)                            :  19  
##                      Junction_Control Second_Road_Number
##  Authorised person           :   0    Min.   :  -1      
##  Auto traffic signal         :  48    1st Qu.:   0      
##  Data missing or out of range:1173    Median :   0      
##  Give way or uncontrolled    : 209    Mean   : 171      
##  Stop sign                   :   8    3rd Qu.:   0      
##                                       Max.   :9310      
##                                                         
##                   Pedestrian_Crossing-Human_Control
##  Control by other authorised person:   2           
##  Control by school crossing patrol :   0           
##  Data missing or out of range      :   2           
##  None within 50 metres             :1434           
##                                                    
##                                                    
##                                                    
##                                                       Pedestrian_Crossing-Physical_Facilities
##  Central refuge                                                           :  10              
##  Data missing or out of range                                             :   2              
##  Footbridge or subway                                                     :   5              
##  No physical crossing facilities within 50 metres                         :1371              
##  Pedestrian phase at traffic signal junction                              :  22              
##  Pelican, puffin, toucan or similar non-junction pedestrian light crossing:  19              
##  Zebra                                                                    :   9              
##                     Light_Conditions             Weather_Conditions
##  Darkness - lighting unknown:  18    Fine no high winds   :1137    
##  Darkness - lights lit      : 100    Raining no high winds: 179    
##  Darkness - lights unlit    :   8    Fine + high winds    :  32    
##  Darkness - no lighting     : 223    Raining + high winds :  31    
##  Daylight                   :1089    Other                :  23    
##                                      Unknown              :  16    
##                                      (Other)              :  20    
##                  Road_Surface_Conditions             Special_Conditions_at_Site
##  Data missing or out of range:  5        None                     :1364        
##  Dry                         :968        Roadworks                :  35        
##  Flood over 3cm. deep        :  9        Mud                      :  13        
##  Frost or ice                : 33        Road surface defective   :  13        
##  Snow                        : 11        Auto traffic signal - out:   5        
##  Wet or damp                 :412        Oil or diesel            :   5        
##                                          (Other)                  :   3        
##                                       Carriageway_Hazards Urban_or_Rural_Area
##  Any animal in carriageway (except ridden horse):  17     Rural:1308         
##  Data missing or out of range                   :   1     Urban: 130         
##  None                                           :1391                        
##  Other object on road                           :  16                        
##  Pedestrian in carriageway - not injured        :   4                        
##  Previous accident                              :   7                        
##  Vehicle load on road                           :   2                        
##                                                    Did_Police_Officer_Attend_Scene_of_Accident
##  No                                                                      : 173                
##  No - accident was reported using a self completion  form (self rep only):   3                
##  Yes                                                                     :1262                
##                                                                                               
##                                                                                               
##                                                                                               
##                                                                                               
##     Season        Month          Hour           Day_Period 
##  Fall  :262   Apr    :206   Min.   : 0.00   Night    :123  
##  Spring:488   Jun    :152   1st Qu.:10.00   Morning  :378  
##  Summer:368   Mar    :134   Median :15.00   Afternoon:755  
##  Winter:320   Aug    :132   Mean   :13.67   Evening  :182  
##               Sep    :130   3rd Qu.:17.00                  
##               May    :125   Max.   :23.00                  
##               (Other):559                                  
##      Casualties_Class Number_Vehicles_Class    cluster 
##  Small       :1270    Small :1210           Min.   :1  
##  Medium      : 137    Medium: 228           1st Qu.:1  
##  Large       :  31    Large :   0           Median :1  
##  Overweelming:   0                          Mean   :1  
##                                             3rd Qu.:1  
##                                             Max.   :1  
##                                                        
## 
## [[2]]
##    Longitude           Latitude                  Police_Force Accident_Severity
##  Min.   :-5.28016   Min.   :50.23   Metropolitan Police:358   Fatal  : 12      
##  1st Qu.:-1.81518   1st Qu.:51.45   Thames Valley      : 42   Serious:122      
##  Median :-0.42135   Median :51.56   Kent               : 39   Slight :884      
##  Mean   :-0.98840   Mean   :52.18   Sussex             : 37                    
##  3rd Qu.:-0.09676   3rd Qu.:52.72   Hampshire          : 33                    
##  Max.   : 1.74888   Max.   :57.44   West Midlands      : 29                    
##                                     (Other)            :480                    
##  Number_of_Vehicles    Day_of_Week   Local_Authority_District
##  Min.   :1.000      Friday   :144   Westminster  : 25        
##  1st Qu.:1.000      Monday   : 94   Croydon      : 23        
##  Median :2.000      Saturday :140   Tower Hamlets: 22        
##  Mean   :1.788      Sunday   :125   Hackney      : 20        
##  3rd Qu.:2.000      Thursday :137   Lambeth      : 20        
##  Max.   :8.000      Tuesday  :232   Camden       : 18        
##                     Wednesday:146   (Other)      :890        
##      First_Road_Class First_Road_Number              Road_Type    Speed_limit 
##  A           :628     Min.   :   0.00   Dual carriageway  :139   30     :814  
##  A(M)        :  3     1st Qu.:  10.25   One way street    : 23   40     : 88  
##  B           :112     Median : 231.50   Roundabout        :112   70     : 36  
##  C           : 88     Mean   :1043.48   Single carriageway:731   50     : 35  
##  Motorway    : 16     3rd Qu.: 976.75   Slip road         : 10   60     : 27  
##  Unclassified:171     Max.   :9113.00   Unknown           :  3   20     : 18  
##                                                                  (Other):  0  
##                             Junction_Detail
##  T or staggered junction            :445   
##  Crossroads                         :154   
##  Roundabout                         :149   
##  Not at junction or within 20 metres:142   
##  Private drive or entrance          : 37   
##  Mini-roundabout                    : 27   
##  (Other)                            : 64   
##                      Junction_Control Second_Road_Number
##  Authorised person           :  0     Min.   :  -1.0    
##  Auto traffic signal         :180     1st Qu.:   0.0    
##  Data missing or out of range:142     Median :   0.0    
##  Give way or uncontrolled    :689     Mean   : 584.7    
##  Stop sign                   :  7     3rd Qu.: 111.8    
##                                       Max.   :9501.0    
##                                                         
##                   Pedestrian_Crossing-Human_Control
##  Control by other authorised person:   5           
##  Control by school crossing patrol :   0           
##  Data missing or out of range      :   0           
##  None within 50 metres             :1013           
##                                                    
##                                                    
##                                                    
##                                                       Pedestrian_Crossing-Physical_Facilities
##  Central refuge                                                           : 33               
##  Data missing or out of range                                             :  0               
##  Footbridge or subway                                                     :  7               
##  No physical crossing facilities within 50 metres                         :713               
##  Pedestrian phase at traffic signal junction                              :140               
##  Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 75               
##  Zebra                                                                    : 50               
##                     Light_Conditions             Weather_Conditions
##  Darkness - lighting unknown: 35     Fine no high winds   :776     
##  Darkness - lights lit      :820     Raining no high winds:143     
##  Darkness - lights unlit    :  4     Unknown              : 27     
##  Darkness - no lighting     : 37     Raining + high winds : 25     
##  Daylight                   :122     Fine + high winds    : 20     
##                                      Other                : 18     
##                                      (Other)              :  9     
##                  Road_Surface_Conditions
##  Data missing or out of range:  1       
##  Dry                         :647       
##  Flood over 3cm. deep        :  0       
##  Frost or ice                : 16       
##  Snow                        :  6       
##  Wet or damp                 :348       
##                                         
##                               Special_Conditions_at_Site
##  None                                      :1003        
##  Roadworks                                 :   7        
##  Auto traffic signal - out                 :   5        
##  Mud                                       :   1        
##  Road sign or marking defective or obscured:   1        
##  Road surface defective                    :   1        
##  (Other)                                   :   0        
##                                       Carriageway_Hazards Urban_or_Rural_Area
##  Any animal in carriageway (except ridden horse):   6     Rural:158          
##  Data missing or out of range                   :   0     Urban:860          
##  None                                           :1006                        
##  Other object on road                           :   2                        
##  Pedestrian in carriageway - not injured        :   3                        
##  Previous accident                              :   0                        
##  Vehicle load on road                           :   1                        
##                                                    Did_Police_Officer_Attend_Scene_of_Accident
##  No                                                                      :179                 
##  No - accident was reported using a self completion  form (self rep only):  3                 
##  Yes                                                                     :836                 
##                                                                                               
##                                                                                               
##                                                                                               
##                                                                                               
##     Season        Month          Hour           Day_Period 
##  Fall  :288   Jan    :226   Min.   : 0.00   Night    :114  
##  Spring:135   Feb    :140   1st Qu.:17.00   Morning  : 27  
##  Summer:113   Dec    :125   Median :19.00   Afternoon:285  
##  Winter:482   Nov    :112   Mean   :17.27   Evening  :592  
##               Mar    :104   3rd Qu.:21.00                  
##               Oct    : 68   Max.   :23.00                  
##               (Other):243                                  
##      Casualties_Class Number_Vehicles_Class    cluster 
##  Small       :972     Small :957            Min.   :2  
##  Medium      : 38     Medium: 61            1st Qu.:2  
##  Large       :  8     Large :  0            Median :2  
##  Overweelming:  0                           Mean   :2  
##                                             3rd Qu.:2  
##                                             Max.   :2  
##                                                        
## 
## [[3]]
##    Longitude           Latitude                  Police_Force Accident_Severity
##  Min.   :-5.43716   Min.   :50.11   Metropolitan Police:643   Fatal  :  13     
##  1st Qu.:-1.64347   1st Qu.:51.45   Thames Valley      : 75   Serious: 234     
##  Median :-0.40139   Median :51.57   Sussex             : 66   Slight :1637     
##  Mean   :-0.92433   Mean   :52.17   Hampshire          : 60                    
##  3rd Qu.:-0.09699   3rd Qu.:52.90   Kent               : 58                    
##  Max.   : 1.74594   Max.   :57.64   Surrey             : 54                    
##                                     (Other)            :928                    
##  Number_of_Vehicles    Day_of_Week  Local_Authority_District
##  Min.   :1.000      Friday   :465   Westminster:  39        
##  1st Qu.:2.000      Monday   :196   Lambeth    :  33        
##  Median :2.000      Saturday :223   Brent      :  32        
##  Mean   :1.902      Sunday   :184   Wandsworth :  31        
##  3rd Qu.:2.000      Thursday :313   Croydon    :  29        
##  Max.   :6.000      Tuesday  :250   Islington  :  28        
##                     Wednesday:253   (Other)    :1692        
##      First_Road_Class First_Road_Number              Road_Type   
##  A           :1144    Min.   :   0      Dual carriageway  : 233  
##  A(M)        :   5    1st Qu.:  11      One way street    :  34  
##  B           : 220    Median : 238      Roundabout        : 213  
##  C           : 186    Mean   :1087      Single carriageway:1379  
##  Motorway    :  23    3rd Qu.:1041      Slip road         :  20  
##  Unclassified: 306    Max.   :9125      Unknown           :   5  
##                                                                  
##   Speed_limit                              Junction_Detail
##  30     :1398   T or staggered junction            :921   
##  40     : 166   Roundabout                         :259   
##  60     : 128   Crossroads                         :249   
##  50     :  77   Not at junction or within 20 metres:194   
##  70     :  63   Private drive or entrance          :101   
##  20     :  52   Other junction                     : 54   
##  (Other):   0   (Other)                            :106   
##                      Junction_Control Second_Road_Number
##  Authorised person           :   1    Min.   :  -1.0    
##  Auto traffic signal         : 296    1st Qu.:   0.0    
##  Data missing or out of range: 195    Median :   0.0    
##  Give way or uncontrolled    :1379    Mean   : 531.0    
##  Stop sign                   :  13    3rd Qu.: 112.2    
##                                       Max.   :9556.0    
##                                                         
##                   Pedestrian_Crossing-Human_Control
##  Control by other authorised person:   7           
##  Control by school crossing patrol :   8           
##  Data missing or out of range      :   0           
##  None within 50 metres             :1869           
##                                                    
##                                                    
##                                                    
##                                                       Pedestrian_Crossing-Physical_Facilities
##  Central refuge                                                           :  49              
##  Data missing or out of range                                             :   0              
##  Footbridge or subway                                                     :   7              
##  No physical crossing facilities within 50 metres                         :1379              
##  Pedestrian phase at traffic signal junction                              : 226              
##  Pelican, puffin, toucan or similar non-junction pedestrian light crossing: 136              
##  Zebra                                                                    :  87              
##                     Light_Conditions             Weather_Conditions
##  Darkness - lighting unknown:  10    Fine no high winds   :1632    
##  Darkness - lights lit      :  63    Raining no high winds: 154    
##  Darkness - lights unlit    :   3    Unknown              :  33    
##  Darkness - no lighting     :  11    Other                :  21    
##  Daylight                   :1797    Fine + high winds    :  16    
##                                      Raining + high winds :  13    
##                                      (Other)              :  15    
##                  Road_Surface_Conditions
##  Data missing or out of range:   4      
##  Dry                         :1505      
##  Flood over 3cm. deep        :   1      
##  Frost or ice                :  13      
##  Snow                        :   5      
##  Wet or damp                 : 356      
##                                         
##               Special_Conditions_at_Site
##  None                      :1852        
##  Roadworks                 :  18        
##  Oil or diesel             :   6        
##  Auto traffic signal - out :   3        
##  Road surface defective    :   2        
##  Auto signal part defective:   1        
##  (Other)                   :   2        
##                                       Carriageway_Hazards Urban_or_Rural_Area
##  Any animal in carriageway (except ridden horse):   3     Rural: 438         
##  Data missing or out of range                   :   1     Urban:1446         
##  None                                           :1867                        
##  Other object on road                           :   7                        
##  Pedestrian in carriageway - not injured        :   3                        
##  Previous accident                              :   3                        
##  Vehicle load on road                           :   0                        
##                                                    Did_Police_Officer_Attend_Scene_of_Accident
##  No                                                                      : 375                
##  No - accident was reported using a self completion  form (self rep only):   2                
##  Yes                                                                     :1507                
##                                                                                               
##                                                                                               
##                                                                                               
##                                                                                               
##     Season        Month          Hour           Day_Period  
##  Fall  :387   Jul    :410   Min.   : 0.00   Night    : 110  
##  Spring:254   Sep    :246   1st Qu.: 8.00   Morning  :1263  
##  Summer:909   Aug    :226   Median :11.00   Afternoon: 423  
##  Winter:334   Jun    :152   Mean   :10.97   Evening  :  88  
##               Mar    :139   3rd Qu.:13.00                   
##               Nov    :135   Max.   :23.00                   
##               (Other):576                                   
##      Casualties_Class Number_Vehicles_Class    cluster 
##  Small       :1800    Small :1714           Min.   :3  
##  Medium      :  71    Medium: 170           1st Qu.:3  
##  Large       :  12    Large :   0           Median :3  
##  Overweelming:   1                          Mean   :3  
##                                             3rd Qu.:3  
##                                             Max.   :3  
##                                                        
## 
## [[4]]
##    Longitude          Latitude                  Police_Force Accident_Severity
##  Min.   :-5.4311   Min.   :50.17   West Midlands      :197   Fatal  :  10     
##  1st Qu.:-2.3917   1st Qu.:51.51   Metropolitan Police: 81   Serious: 169     
##  Median :-1.7967   Median :52.47   Hampshire          : 54   Slight :1058     
##  Mean   :-1.6770   Mean   :52.58   Devon and Cornwall : 43                    
##  3rd Qu.:-0.8813   3rd Qu.:53.42   Greater Manchester : 42                    
##  Max.   : 1.7537   Max.   :57.48   Lancashire         : 42                    
##                                    (Other)            :778                    
##  Number_of_Vehicles    Day_of_Week      Local_Authority_District
##  Min.   :1.000      Friday   :116   Birmingham      : 111       
##  1st Qu.:2.000      Monday   :356   Coventry        :  18       
##  Median :2.000      Saturday :160   Sandwell        :  16       
##  Mean   :1.863      Sunday   :131   Bristol, City of:  15       
##  3rd Qu.:2.000      Thursday :188   Liverpool       :  15       
##  Max.   :5.000      Tuesday  :123   Nottingham      :  14       
##                     Wednesday:163   (Other)         :1048       
##      First_Road_Class First_Road_Number              Road_Type   
##  A           :264     Min.   :   0      Dual carriageway  :  82  
##  A(M)        :  4     1st Qu.:   0      One way street    :  30  
##  B           :189     Median :   5      Roundabout        :  82  
##  C           : 95     Mean   :1069      Single carriageway:1029  
##  Motorway    :  9     3rd Qu.: 813      Slip road         :   5  
##  Unclassified:676     Max.   :9304      Unknown           :   9  
##                                                                  
##   Speed_limit                              Junction_Detail
##  30     :1022   T or staggered junction            :702   
##  40     :  73   Crossroads                         :163   
##  60     :  54   Roundabout                         :113   
##  20     :  53   Not at junction or within 20 metres: 89   
##  50     :  20   Private drive or entrance          : 67   
##  70     :  15   Other junction                     : 47   
##  (Other):   0   (Other)                            : 56   
##                      Junction_Control Second_Road_Number
##  Authorised person           :   0    Min.   :  -1.0    
##  Auto traffic signal         : 118    1st Qu.:   0.0    
##  Data missing or out of range:  89    Median :   0.0    
##  Give way or uncontrolled    :1026    Mean   : 516.5    
##  Stop sign                   :   4    3rd Qu.:   0.0    
##                                       Max.   :9219.0    
##                                                         
##                   Pedestrian_Crossing-Human_Control
##  Control by other authorised person:   6           
##  Control by school crossing patrol :   3           
##  Data missing or out of range      :   0           
##  None within 50 metres             :1228           
##                                                    
##                                                    
##                                                    
##                                                       Pedestrian_Crossing-Physical_Facilities
##  Central refuge                                                           :  38              
##  Data missing or out of range                                             :   0              
##  Footbridge or subway                                                     :   2              
##  No physical crossing facilities within 50 metres                         :1013              
##  Pedestrian phase at traffic signal junction                              :  83              
##  Pelican, puffin, toucan or similar non-junction pedestrian light crossing:  75              
##  Zebra                                                                    :  26              
##                     Light_Conditions             Weather_Conditions
##  Darkness - lighting unknown:  16    Fine no high winds   :1034    
##  Darkness - lights lit      : 136    Raining no high winds: 117    
##  Darkness - lights unlit    :   4    Unknown              :  26    
##  Darkness - no lighting     :  18    Fine + high winds    :  20    
##  Daylight                   :1063    Other                :  18    
##                                      Raining + high winds :  16    
##                                      (Other)              :   6    
##                  Road_Surface_Conditions
##  Data missing or out of range:  3       
##  Dry                         :975       
##  Flood over 3cm. deep        :  0       
##  Frost or ice                :  2       
##  Snow                        :  1       
##  Wet or damp                 :256       
##                                         
##                               Special_Conditions_at_Site
##  None                                      :1221        
##  Roadworks                                 :   8        
##  Road sign or marking defective or obscured:   4        
##  Oil or diesel                             :   2        
##  Auto traffic signal - out                 :   1        
##  Road surface defective                    :   1        
##  (Other)                                   :   0        
##                                       Carriageway_Hazards Urban_or_Rural_Area
##  Any animal in carriageway (except ridden horse):   0     Rural: 209         
##  Data missing or out of range                   :   0     Urban:1028         
##  None                                           :1229                        
##  Other object on road                           :   5                        
##  Pedestrian in carriageway - not injured        :   2                        
##  Previous accident                              :   1                        
##  Vehicle load on road                           :   0                        
##                                                    Did_Police_Officer_Attend_Scene_of_Accident
##  No                                                                      :281                 
##  No - accident was reported using a self completion  form (self rep only):  6                 
##  Yes                                                                     :950                 
##                                                                                               
##                                                                                               
##                                                                                               
##                                                                                               
##     Season        Month          Hour           Day_Period 
##  Fall  :243   May    :287   Min.   : 0.00   Night    : 52  
##  Spring:622   Apr    :156   1st Qu.:13.00   Morning  :199  
##  Summer:172   Jun    :143   Median :15.00   Afternoon:868  
##  Winter:200   Mar    :122   Mean   :14.62   Evening  :118  
##               Nov    : 91   3rd Qu.:17.00                  
##               Sep    : 82   Max.   :23.00                  
##               (Other):356                                  
##      Casualties_Class Number_Vehicles_Class    cluster 
##  Small       :1151    Small :1134           Min.   :4  
##  Medium      :  74    Medium: 103           1st Qu.:4  
##  Large       :  12    Large :   0           Median :4  
##  Overweelming:   0                          Mean   :4  
##                                             3rd Qu.:4  
##                                             Max.   :4  
##                                                        
## 
## [[5]]
##    Longitude          Latitude                  Police_Force Accident_Severity
##  Min.   :-5.2256   Min.   :50.23   West Yorkshire     :175   Fatal  :  18     
##  1st Qu.:-2.6191   1st Qu.:51.57   Metropolitan Police: 96   Serious: 219     
##  Median :-1.5835   Median :53.05   Strathclyde        : 68   Slight :1186     
##  Mean   :-1.6915   Mean   :52.97   Lancashire         : 60                    
##  3rd Qu.:-0.7561   3rd Qu.:53.78   Merseyside         : 57                    
##  Max.   : 1.7336   Max.   :58.59   Thames Valley      : 55                    
##                                    (Other)            :912                    
##  Number_of_Vehicles    Day_of_Week  Local_Authority_District
##  Min.   : 1.00      Friday   :208   Leeds       :  75       
##  1st Qu.: 1.00      Monday   :171   Kirklees    :  30       
##  Median : 2.00      Saturday :187   Liverpool   :  28       
##  Mean   : 1.75      Sunday   :141   Bradford    :  27       
##  3rd Qu.: 2.00      Thursday :230   Wakefield   :  27       
##  Max.   :10.00      Tuesday  :340   Glasgow City:  25       
##                     Wednesday:146   (Other)     :1211       
##      First_Road_Class First_Road_Number              Road_Type   
##  A           :349     Min.   :   0.0    Dual carriageway  : 191  
##  A(M)        : 12     1st Qu.:   0.0    One way street    :  47  
##  B           :165     Median :  19.0    Roundabout        :  28  
##  C           : 93     Mean   : 984.9    Single carriageway:1123  
##  Motorway    : 73     3rd Qu.: 649.5    Slip road         :  14  
##  Unclassified:731     Max.   :9711.0    Unknown           :  20  
##                                                                  
##   Speed_limit                              Junction_Detail
##  30     :1072   Not at junction or within 20 metres:1184  
##  70     :  92   Crossroads                         :  84  
##  40     :  86   T or staggered junction            :  56  
##  20     :  77   Roundabout                         :  36  
##  60     :  64   Private drive or entrance          :  26  
##  50     :  32   Other junction                     :  17  
##  (Other):   0   (Other)                            :  20  
##                      Junction_Control Second_Road_Number
##  Authorised person           :   1    Min.   :  -1.0    
##  Auto traffic signal         :  82    1st Qu.:   0.0    
##  Data missing or out of range:1184    Median :   0.0    
##  Give way or uncontrolled    : 152    Mean   :  85.6    
##  Stop sign                   :   4    3rd Qu.:   0.0    
##                                       Max.   :7201.0    
##                                                         
##                   Pedestrian_Crossing-Human_Control
##  Control by other authorised person:   3           
##  Control by school crossing patrol :   5           
##  Data missing or out of range      :   3           
##  None within 50 metres             :1412           
##                                                    
##                                                    
##                                                    
##                                                       Pedestrian_Crossing-Physical_Facilities
##  Central refuge                                                           :  27              
##  Data missing or out of range                                             :   3              
##  Footbridge or subway                                                     :   1              
##  No physical crossing facilities within 50 metres                         :1216              
##  Pedestrian phase at traffic signal junction                              :  70              
##  Pelican, puffin, toucan or similar non-junction pedestrian light crossing:  62              
##  Zebra                                                                    :  44              
##                     Light_Conditions             Weather_Conditions
##  Darkness - lighting unknown:  16    Fine no high winds   :1157    
##  Darkness - lights lit      : 258    Raining no high winds: 155    
##  Darkness - lights unlit    :   8    Unknown              :  29    
##  Darkness - no lighting     :  57    Fine + high winds    :  24    
##  Daylight                   :1084    Raining + high winds :  24    
##                                      Other                :  18    
##                                      (Other)              :  16    
##                  Road_Surface_Conditions
##  Data missing or out of range:   5      
##  Dry                         :1025      
##  Flood over 3cm. deep        :   2      
##  Frost or ice                :   6      
##  Snow                        :   5      
##  Wet or damp                 : 380      
##                                         
##                 Special_Conditions_at_Site
##  None                        :1378        
##  Roadworks                   :  23        
##  Mud                         :   7        
##  Oil or diesel               :   4        
##  Road surface defective      :   4        
##  Data missing or out of range:   3        
##  (Other)                     :   4        
##                                       Carriageway_Hazards Urban_or_Rural_Area
##  Any animal in carriageway (except ridden horse):   6     Rural: 305         
##  Data missing or out of range                   :   3     Urban:1118         
##  None                                           :1387                        
##  Other object on road                           :  19                        
##  Pedestrian in carriageway - not injured        :   5                        
##  Previous accident                              :   1                        
##  Vehicle load on road                           :   2                        
##                                                    Did_Police_Officer_Attend_Scene_of_Accident
##  No                                                                      : 367                
##  No - accident was reported using a self completion  form (self rep only):   8                
##  Yes                                                                     :1048                
##                                                                                               
##                                                                                               
##                                                                                               
##                                                                                               
##     Season        Month          Hour           Day_Period 
##  Fall  :727   Oct    :320   Min.   : 0.00   Night    : 88  
##  Spring:179   Nov    :193   1st Qu.:10.00   Morning  :423  
##  Summer:282   Dec    :163   Median :14.00   Afternoon:759  
##  Winter:235   Sep    :140   Mean   :13.53   Evening  :153  
##               Jun    :101   3rd Qu.:17.00                  
##               Aug    :100   Max.   :23.00                  
##               (Other):406                                  
##      Casualties_Class Number_Vehicles_Class    cluster 
##  Small       :1344    Small :1280           Min.   :5  
##  Medium      :  67    Medium: 142           1st Qu.:5  
##  Large       :  12    Large :   1           Median :5  
##  Overweelming:   0                          Mean   :5  
##                                             3rd Qu.:5  
##                                             Max.   :5  
## 

Visualization of the cluster in a lower dimensional space

tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering))
ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster))

  • Note: Is possoble to observe a dense red area, confirming the relevancy of the segmentation.

Lets try some Clustering and Analyse over time (trimester by trimester)

# Create the array list to iterate
# Considering that we are subsetting, we can handle the subset all. condideting dates, lets make motn by mothn

# Lets set out array to iterate
month_array <- c('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Iterate and do things
for ( month in month_array){ 
  
  print(month)
  
  # Some intial colum name analisis
  (colnames(df_subset))
  
  # Lets drop initialy the time to see things
  df_notime_temp <- df_subset[, -c('Date', 'Time', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
                              'LSOA_of_Accident_Location','Second_Road_Class', 'Number_of_Casualties')]

  # Access the number of levels
  (apply(df_notime_temp, 2, function(x) length( unique(x) )))

  # Access the incomplete cases
  (apply(is.na(df_notime_temp), 2, sum))

  # Count incomplete cases
  sum(apply(df_notime_temp, 1, function(x){any(is.na(x))}))
 
  # Remove those incomplete cases in small number
  df_notime_temp <- na.omit(df_notime_temp)
  
  # Convert chars to factors in this reduced space
  df_notime_temp <- mutate_if(df_notime_temp, is.character, as.factor)

  # Compute Gower distance
  gower_dist <- daisy(df_notime_temp, metric = "gower")
  gower_mat <- as.matrix(gower_dist)
  
  # Do pam clustering Medoid
  k <- 5
  pam_fit <- pam(gower_dist, diss = TRUE, k)
  pam_results <- df_notime_sample %>%
    mutate(cluster = pam_fit$clustering) %>%
    group_by(cluster) %>%
    do(the_summary = summary(.))
  pam_results$the_summary
  
  # Do some representation
  tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
  tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
    setNames(c("X", "Y")) %>%
    mutate(cluster = factor(pam_fit$clustering))

  ggplot(aes(x = X, y = Y), data = tsne_data) +
    geom_point(aes(color = cluster))
  
  
}